generator.hpp 29 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843
  1. // generator.hpp
  2. // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_LEXER_GENERATOR_HPP
  7. #define BOOST_LEXER_GENERATOR_HPP
  8. #include "char_traits.hpp"
  9. // memcmp()
  10. #include <cstring>
  11. #include "partition/charset.hpp"
  12. #include "partition/equivset.hpp"
  13. #include <memory>
  14. #include <limits>
  15. #include "parser/tree/node.hpp"
  16. #include "parser/parser.hpp"
  17. #include "containers/ptr_list.hpp"
  18. #include <boost/move/unique_ptr.hpp>
  19. #include "rules.hpp"
  20. #include "state_machine.hpp"
  21. namespace boost
  22. {
  23. namespace lexer
  24. {
  25. template<typename CharT, typename Traits = char_traits<CharT> >
  26. class basic_generator
  27. {
  28. public:
  29. typedef typename detail::internals::size_t_vector size_t_vector;
  30. typedef basic_rules<CharT> rules;
  31. static void build (const rules &rules_,
  32. basic_state_machine<CharT> &state_machine_)
  33. {
  34. std::size_t index_ = 0;
  35. std::size_t size_ = rules_.statemap ().size ();
  36. node_ptr_vector node_ptr_vector_;
  37. detail::internals &internals_ = const_cast<detail::internals &>
  38. (state_machine_.data ());
  39. bool seen_BOL_assertion_ = false;
  40. bool seen_EOL_assertion_ = false;
  41. state_machine_.clear ();
  42. for (; index_ < size_; ++index_)
  43. {
  44. internals_._lookup->push_back (static_cast<size_t_vector *>(0));
  45. internals_._lookup->back () = new size_t_vector;
  46. internals_._dfa_alphabet.push_back (0);
  47. internals_._dfa->push_back (static_cast<size_t_vector *>(0));
  48. internals_._dfa->back () = new size_t_vector;
  49. }
  50. for (index_ = 0, size_ = internals_._lookup->size ();
  51. index_ < size_; ++index_)
  52. {
  53. internals_._lookup[index_]->resize (sizeof (CharT) == 1 ?
  54. num_chars : num_wchar_ts, dead_state_index);
  55. if (!rules_.regexes ()[index_].empty ())
  56. {
  57. // vector mapping token indexes to partitioned token index sets
  58. index_set_vector set_mapping_;
  59. // syntax tree
  60. detail::node *root_ = build_tree (rules_, index_,
  61. node_ptr_vector_, internals_, set_mapping_);
  62. build_dfa (root_, set_mapping_,
  63. internals_._dfa_alphabet[index_],
  64. *internals_._dfa[index_]);
  65. if (internals_._seen_BOL_assertion)
  66. {
  67. seen_BOL_assertion_ = true;
  68. }
  69. if (internals_._seen_EOL_assertion)
  70. {
  71. seen_EOL_assertion_ = true;
  72. }
  73. internals_._seen_BOL_assertion = false;
  74. internals_._seen_EOL_assertion = false;
  75. }
  76. }
  77. internals_._seen_BOL_assertion = seen_BOL_assertion_;
  78. internals_._seen_EOL_assertion = seen_EOL_assertion_;
  79. }
  80. static void minimise (basic_state_machine<CharT> &state_machine_)
  81. {
  82. detail::internals &internals_ = const_cast<detail::internals &>
  83. (state_machine_.data ());
  84. const std::size_t machines_ = internals_._dfa->size ();
  85. for (std::size_t i_ = 0; i_ < machines_; ++i_)
  86. {
  87. const std::size_t dfa_alphabet_ = internals_._dfa_alphabet[i_];
  88. size_t_vector *dfa_ = internals_._dfa[i_];
  89. if (dfa_alphabet_ != 0)
  90. {
  91. std::size_t size_ = 0;
  92. do
  93. {
  94. size_ = dfa_->size ();
  95. minimise_dfa (dfa_alphabet_, *dfa_, size_);
  96. } while (dfa_->size () != size_);
  97. }
  98. }
  99. }
  100. protected:
  101. typedef detail::basic_charset<CharT> charset;
  102. typedef detail::ptr_list<charset> charset_list;
  103. typedef boost::movelib::unique_ptr<charset> charset_ptr;
  104. typedef detail::equivset equivset;
  105. typedef detail::ptr_list<equivset> equivset_list;
  106. typedef boost::movelib::unique_ptr<equivset> equivset_ptr;
  107. typedef typename charset::index_set index_set;
  108. typedef std::vector<index_set> index_set_vector;
  109. typedef detail::basic_parser<CharT> parser;
  110. typedef typename parser::node_ptr_vector node_ptr_vector;
  111. typedef std::set<const detail::node *> node_set;
  112. typedef detail::ptr_vector<node_set> node_set_vector;
  113. typedef std::vector<const detail::node *> node_vector;
  114. typedef detail::ptr_vector<node_vector> node_vector_vector;
  115. typedef typename parser::string string;
  116. typedef std::pair<string, string> string_pair;
  117. typedef typename parser::tokeniser::string_token string_token;
  118. typedef std::deque<string_pair> macro_deque;
  119. typedef std::pair<string, const detail::node *> macro_pair;
  120. typedef typename parser::macro_map::iterator macro_iter;
  121. typedef std::pair<macro_iter, bool> macro_iter_pair;
  122. typedef typename parser::tokeniser::token_map token_map;
  123. static detail::node *build_tree (const rules &rules_,
  124. const std::size_t state_, node_ptr_vector &node_ptr_vector_,
  125. detail::internals &internals_, index_set_vector &set_mapping_)
  126. {
  127. size_t_vector *lookup_ = internals_._lookup[state_];
  128. const typename rules::string_deque_deque &regexes_ =
  129. rules_.regexes ();
  130. const typename rules::id_vector_deque &ids_ = rules_.ids ();
  131. const typename rules::id_vector_deque &unique_ids_ =
  132. rules_.unique_ids ();
  133. const typename rules::id_vector_deque &states_ = rules_.states ();
  134. typename rules::string_deque::const_iterator regex_iter_ =
  135. regexes_[state_].begin ();
  136. typename rules::string_deque::const_iterator regex_iter_end_ =
  137. regexes_[state_].end ();
  138. typename rules::id_vector::const_iterator ids_iter_ =
  139. ids_[state_].begin ();
  140. typename rules::id_vector::const_iterator unique_ids_iter_ =
  141. unique_ids_[state_].begin ();
  142. typename rules::id_vector::const_iterator states_iter_ =
  143. states_[state_].begin ();
  144. const typename rules::string &regex_ = *regex_iter_;
  145. // map of regex charset tokens (strings) to index
  146. token_map token_map_;
  147. const typename rules::string_pair_deque &macrodeque_ =
  148. rules_.macrodeque ();
  149. typename parser::macro_map macromap_;
  150. typename detail::node::node_vector tree_vector_;
  151. build_macros (token_map_, macrodeque_, macromap_,
  152. rules_.flags (), rules_.locale (), node_ptr_vector_,
  153. internals_._seen_BOL_assertion, internals_._seen_EOL_assertion);
  154. detail::node *root_ = parser::parse (regex_.c_str (),
  155. regex_.c_str () + regex_.size (), *ids_iter_, *unique_ids_iter_,
  156. *states_iter_, rules_.flags (), rules_.locale (), node_ptr_vector_,
  157. macromap_, token_map_, internals_._seen_BOL_assertion,
  158. internals_._seen_EOL_assertion);
  159. ++regex_iter_;
  160. ++ids_iter_;
  161. ++unique_ids_iter_;
  162. ++states_iter_;
  163. tree_vector_.push_back (root_);
  164. // build syntax trees
  165. while (regex_iter_ != regex_iter_end_)
  166. {
  167. // re-declare var, otherwise we perform an assignment..!
  168. const typename rules::string &regex2_ = *regex_iter_;
  169. root_ = parser::parse (regex2_.c_str (),
  170. regex2_.c_str () + regex2_.size (), *ids_iter_,
  171. *unique_ids_iter_, *states_iter_, rules_.flags (),
  172. rules_.locale (), node_ptr_vector_, macromap_, token_map_,
  173. internals_._seen_BOL_assertion,
  174. internals_._seen_EOL_assertion);
  175. tree_vector_.push_back (root_);
  176. ++regex_iter_;
  177. ++ids_iter_;
  178. ++unique_ids_iter_;
  179. ++states_iter_;
  180. }
  181. if (internals_._seen_BOL_assertion)
  182. {
  183. // Fixup BOLs
  184. typename detail::node::node_vector::iterator iter_ =
  185. tree_vector_.begin ();
  186. typename detail::node::node_vector::iterator end_ =
  187. tree_vector_.end ();
  188. for (; iter_ != end_; ++iter_)
  189. {
  190. fixup_bol (*iter_, node_ptr_vector_);
  191. }
  192. }
  193. // join trees
  194. {
  195. typename detail::node::node_vector::iterator iter_ =
  196. tree_vector_.begin ();
  197. typename detail::node::node_vector::iterator end_ =
  198. tree_vector_.end ();
  199. if (iter_ != end_)
  200. {
  201. root_ = *iter_;
  202. ++iter_;
  203. }
  204. for (; iter_ != end_; ++iter_)
  205. {
  206. node_ptr_vector_->push_back (static_cast<detail::selection_node *>(0));
  207. node_ptr_vector_->back () = new detail::selection_node
  208. (root_, *iter_);
  209. root_ = node_ptr_vector_->back ();
  210. }
  211. }
  212. // partitioned token list
  213. charset_list token_list_;
  214. set_mapping_.resize (token_map_.size ());
  215. partition_tokens (token_map_, token_list_);
  216. typename charset_list::list::const_iterator iter_ =
  217. token_list_->begin ();
  218. typename charset_list::list::const_iterator end_ =
  219. token_list_->end ();
  220. std::size_t index_ = 0;
  221. for (; iter_ != end_; ++iter_, ++index_)
  222. {
  223. const charset *cs_ = *iter_;
  224. typename charset::index_set::const_iterator set_iter_ =
  225. cs_->_index_set.begin ();
  226. typename charset::index_set::const_iterator set_end_ =
  227. cs_->_index_set.end ();
  228. fill_lookup (cs_->_token, lookup_, index_);
  229. for (; set_iter_ != set_end_; ++set_iter_)
  230. {
  231. set_mapping_[*set_iter_].insert (index_);
  232. }
  233. }
  234. internals_._dfa_alphabet[state_] = token_list_->size () + dfa_offset;
  235. return root_;
  236. }
  237. static void build_macros (token_map &token_map_,
  238. const macro_deque &macrodeque_,
  239. typename parser::macro_map &macromap_, const regex_flags flags_,
  240. const std::locale &locale_, node_ptr_vector &node_ptr_vector_,
  241. bool &seen_BOL_assertion_, bool &seen_EOL_assertion_)
  242. {
  243. for (typename macro_deque::const_iterator iter_ =
  244. macrodeque_.begin (), end_ = macrodeque_.end ();
  245. iter_ != end_; ++iter_)
  246. {
  247. const typename rules::string &name_ = iter_->first;
  248. const typename rules::string &regex_ = iter_->second;
  249. detail::node *node_ = parser::parse (regex_.c_str (),
  250. regex_.c_str () + regex_.size (), 0, 0, 0, flags_,
  251. locale_, node_ptr_vector_, macromap_, token_map_,
  252. seen_BOL_assertion_, seen_EOL_assertion_);
  253. macro_iter_pair map_iter_ = macromap_.
  254. insert (macro_pair (name_, static_cast<const detail::node *>
  255. (0)));
  256. map_iter_.first->second = node_;
  257. }
  258. }
  259. static void build_dfa (detail::node *root_,
  260. const index_set_vector &set_mapping_, const std::size_t dfa_alphabet_,
  261. size_t_vector &dfa_)
  262. {
  263. typename detail::node::node_vector *followpos_ =
  264. &root_->firstpos ();
  265. node_set_vector seen_sets_;
  266. node_vector_vector seen_vectors_;
  267. size_t_vector hash_vector_;
  268. // 'jam' state
  269. dfa_.resize (dfa_alphabet_, 0);
  270. closure (followpos_, seen_sets_, seen_vectors_,
  271. hash_vector_, dfa_alphabet_, dfa_);
  272. std::size_t *ptr_ = 0;
  273. for (std::size_t index_ = 0; index_ < seen_vectors_->size (); ++index_)
  274. {
  275. equivset_list equiv_list_;
  276. build_equiv_list (seen_vectors_[index_], set_mapping_, equiv_list_);
  277. for (typename equivset_list::list::const_iterator iter_ =
  278. equiv_list_->begin (), end_ = equiv_list_->end ();
  279. iter_ != end_; ++iter_)
  280. {
  281. equivset *equivset_ = *iter_;
  282. const std::size_t transition_ = closure
  283. (&equivset_->_followpos, seen_sets_, seen_vectors_,
  284. hash_vector_, dfa_alphabet_, dfa_);
  285. if (transition_ != npos)
  286. {
  287. ptr_ = &dfa_.front () + ((index_ + 1) * dfa_alphabet_);
  288. // Prune abstemious transitions from end states.
  289. if (*ptr_ && !equivset_->_greedy) continue;
  290. for (typename detail::equivset::index_vector::const_iterator
  291. equiv_iter_ = equivset_->_index_vector.begin (),
  292. equiv_end_ = equivset_->_index_vector.end ();
  293. equiv_iter_ != equiv_end_; ++equiv_iter_)
  294. {
  295. const std::size_t equiv_index_ = *equiv_iter_;
  296. if (equiv_index_ == bol_token)
  297. {
  298. if (ptr_[eol_index] == 0)
  299. {
  300. ptr_[bol_index] = transition_;
  301. }
  302. }
  303. else if (equiv_index_ == eol_token)
  304. {
  305. if (ptr_[bol_index] == 0)
  306. {
  307. ptr_[eol_index] = transition_;
  308. }
  309. }
  310. else
  311. {
  312. ptr_[equiv_index_ + dfa_offset] = transition_;
  313. }
  314. }
  315. }
  316. }
  317. }
  318. }
  319. static std::size_t closure (typename detail::node::node_vector *followpos_,
  320. node_set_vector &seen_sets_, node_vector_vector &seen_vectors_,
  321. size_t_vector &hash_vector_, const std::size_t size_,
  322. size_t_vector &dfa_)
  323. {
  324. bool end_state_ = false;
  325. std::size_t id_ = 0;
  326. std::size_t unique_id_ = npos;
  327. std::size_t state_ = 0;
  328. std::size_t hash_ = 0;
  329. if (followpos_->empty ()) return npos;
  330. std::size_t index_ = 0;
  331. boost::movelib::unique_ptr<node_set> set_ptr_ (new node_set);
  332. boost::movelib::unique_ptr<node_vector> vector_ptr_ (new node_vector);
  333. for (typename detail::node::node_vector::const_iterator iter_ =
  334. followpos_->begin (), end_ = followpos_->end ();
  335. iter_ != end_; ++iter_)
  336. {
  337. closure_ex (*iter_, end_state_, id_, unique_id_, state_,
  338. set_ptr_.get (), vector_ptr_.get (), hash_);
  339. }
  340. bool found_ = false;
  341. typename size_t_vector::const_iterator hash_iter_ =
  342. hash_vector_.begin ();
  343. typename size_t_vector::const_iterator hash_end_ =
  344. hash_vector_.end ();
  345. typename node_set_vector::vector::const_iterator set_iter_ =
  346. seen_sets_->begin ();
  347. for (; hash_iter_ != hash_end_; ++hash_iter_, ++set_iter_)
  348. {
  349. found_ = *hash_iter_ == hash_ && *(*set_iter_) == *set_ptr_;
  350. ++index_;
  351. if (found_) break;
  352. }
  353. if (!found_)
  354. {
  355. seen_sets_->push_back (static_cast<node_set *>(0));
  356. seen_sets_->back () = set_ptr_.release ();
  357. seen_vectors_->push_back (static_cast<node_vector *>(0));
  358. seen_vectors_->back () = vector_ptr_.release ();
  359. hash_vector_.push_back (hash_);
  360. // State 0 is the jam state...
  361. index_ = seen_sets_->size ();
  362. const std::size_t old_size_ = dfa_.size ();
  363. dfa_.resize (old_size_ + size_, 0);
  364. if (end_state_)
  365. {
  366. dfa_[old_size_] |= end_state;
  367. dfa_[old_size_ + id_index] = id_;
  368. dfa_[old_size_ + unique_id_index] = unique_id_;
  369. dfa_[old_size_ + state_index] = state_;
  370. }
  371. }
  372. return index_;
  373. }
  374. static void closure_ex (detail::node *node_, bool &end_state_,
  375. std::size_t &id_, std::size_t &unique_id_, std::size_t &state_,
  376. node_set *set_ptr_, node_vector *vector_ptr_, std::size_t &hash_)
  377. {
  378. const bool temp_end_state_ = node_->end_state ();
  379. if (temp_end_state_)
  380. {
  381. if (!end_state_)
  382. {
  383. end_state_ = true;
  384. id_ = node_->id ();
  385. unique_id_ = node_->unique_id ();
  386. state_ = node_->lexer_state ();
  387. }
  388. }
  389. if (set_ptr_->insert (node_).second)
  390. {
  391. vector_ptr_->push_back (node_);
  392. hash_ += reinterpret_cast<std::size_t> (node_);
  393. }
  394. }
  395. static void partition_tokens (const token_map &map_,
  396. charset_list &lhs_)
  397. {
  398. charset_list rhs_;
  399. fill_rhs_list (map_, rhs_);
  400. if (!rhs_->empty ())
  401. {
  402. typename charset_list::list::iterator iter_;
  403. typename charset_list::list::iterator end_;
  404. charset_ptr overlap_ (new charset);
  405. lhs_->push_back (static_cast<charset *>(0));
  406. lhs_->back () = rhs_->front ();
  407. rhs_->pop_front ();
  408. while (!rhs_->empty ())
  409. {
  410. charset_ptr r_ (rhs_->front ());
  411. rhs_->pop_front ();
  412. iter_ = lhs_->begin ();
  413. end_ = lhs_->end ();
  414. while (!r_->empty () && iter_ != end_)
  415. {
  416. typename charset_list::list::iterator l_iter_ = iter_;
  417. (*l_iter_)->intersect (*r_.get (), *overlap_.get ());
  418. if (overlap_->empty ())
  419. {
  420. ++iter_;
  421. }
  422. else if ((*l_iter_)->empty ())
  423. {
  424. delete *l_iter_;
  425. *l_iter_ = overlap_.release ();
  426. overlap_.reset (new charset);
  427. ++iter_;
  428. }
  429. else if (r_->empty ())
  430. {
  431. overlap_.swap (r_);
  432. overlap_.reset (new charset);
  433. break;
  434. }
  435. else
  436. {
  437. iter_ = lhs_->insert (++iter_,
  438. static_cast<charset *>(0));
  439. *iter_ = overlap_.release ();
  440. overlap_.reset(new charset);
  441. ++iter_;
  442. end_ = lhs_->end ();
  443. }
  444. }
  445. if (!r_->empty ())
  446. {
  447. lhs_->push_back (static_cast<charset *>(0));
  448. lhs_->back () = r_.release ();
  449. }
  450. }
  451. }
  452. }
  453. static void fill_rhs_list (const token_map &map_,
  454. charset_list &list_)
  455. {
  456. typename parser::tokeniser::token_map::const_iterator iter_ =
  457. map_.begin ();
  458. typename parser::tokeniser::token_map::const_iterator end_ =
  459. map_.end ();
  460. for (; iter_ != end_; ++iter_)
  461. {
  462. list_->push_back (static_cast<charset *>(0));
  463. list_->back () = new charset (iter_->first, iter_->second);
  464. }
  465. }
  466. static void fill_lookup (const string_token &token_,
  467. size_t_vector *lookup_, const std::size_t index_)
  468. {
  469. const CharT *curr_ = token_._charset.c_str ();
  470. const CharT *chars_end_ = curr_ + token_._charset.size ();
  471. std::size_t *ptr_ = &lookup_->front ();
  472. const std::size_t max_ = sizeof (CharT) == 1 ?
  473. num_chars : num_wchar_ts;
  474. if (token_._negated)
  475. {
  476. // $$$ FIXME JDG July 2014 $$$
  477. // this code is problematic on platforms where wchar_t is signed
  478. // with min generating negative numbers. This crashes with BAD_ACCESS
  479. // because of the vector index below:
  480. // ptr_[static_cast<typename Traits::index_type>(curr_char_)]
  481. CharT curr_char_ = 0; // (std::numeric_limits<CharT>::min)();
  482. std::size_t i_ = 0;
  483. while (curr_ < chars_end_)
  484. {
  485. while (*curr_ > curr_char_)
  486. {
  487. ptr_[static_cast<typename Traits::index_type>
  488. (curr_char_)] = index_ + dfa_offset;
  489. ++curr_char_;
  490. ++i_;
  491. }
  492. ++curr_char_;
  493. ++curr_;
  494. ++i_;
  495. }
  496. for (; i_ < max_; ++i_)
  497. {
  498. ptr_[static_cast<typename Traits::index_type>(curr_char_)] =
  499. index_ + dfa_offset;
  500. ++curr_char_;
  501. }
  502. }
  503. else
  504. {
  505. while (curr_ < chars_end_)
  506. {
  507. ptr_[static_cast<typename Traits::index_type>(*curr_)] =
  508. index_ + dfa_offset;
  509. ++curr_;
  510. }
  511. }
  512. }
  513. static void build_equiv_list (const node_vector *vector_,
  514. const index_set_vector &set_mapping_, equivset_list &lhs_)
  515. {
  516. equivset_list rhs_;
  517. fill_rhs_list (vector_, set_mapping_, rhs_);
  518. if (!rhs_->empty ())
  519. {
  520. typename equivset_list::list::iterator iter_;
  521. typename equivset_list::list::iterator end_;
  522. equivset_ptr overlap_ (new equivset);
  523. lhs_->push_back (static_cast<equivset *>(0));
  524. lhs_->back () = rhs_->front ();
  525. rhs_->pop_front ();
  526. while (!rhs_->empty ())
  527. {
  528. equivset_ptr r_ (rhs_->front ());
  529. rhs_->pop_front ();
  530. iter_ = lhs_->begin ();
  531. end_ = lhs_->end ();
  532. while (!r_->empty () && iter_ != end_)
  533. {
  534. typename equivset_list::list::iterator l_iter_ = iter_;
  535. (*l_iter_)->intersect (*r_.get (), *overlap_.get ());
  536. if (overlap_->empty ())
  537. {
  538. ++iter_;
  539. }
  540. else if ((*l_iter_)->empty ())
  541. {
  542. delete *l_iter_;
  543. *l_iter_ = overlap_.release ();
  544. overlap_.reset (new equivset);
  545. ++iter_;
  546. }
  547. else if (r_->empty ())
  548. {
  549. overlap_.swap (r_);
  550. overlap_.reset (new equivset);
  551. break;
  552. }
  553. else
  554. {
  555. iter_ = lhs_->insert (++iter_,
  556. static_cast<equivset *>(0));
  557. *iter_ = overlap_.release ();
  558. overlap_.reset (new equivset);
  559. ++iter_;
  560. end_ = lhs_->end ();
  561. }
  562. }
  563. if (!r_->empty ())
  564. {
  565. lhs_->push_back (static_cast<equivset *>(0));
  566. lhs_->back () = r_.release ();
  567. }
  568. }
  569. }
  570. }
  571. static void fill_rhs_list (const node_vector *vector_,
  572. const index_set_vector &set_mapping_, equivset_list &list_)
  573. {
  574. typename node_vector::const_iterator iter_ =
  575. vector_->begin ();
  576. typename node_vector::const_iterator end_ =
  577. vector_->end ();
  578. for (; iter_ != end_; ++iter_)
  579. {
  580. const detail::node *node_ = *iter_;
  581. if (!node_->end_state ())
  582. {
  583. const std::size_t token_ = node_->token ();
  584. if (token_ != null_token)
  585. {
  586. list_->push_back (static_cast<equivset *>(0));
  587. if (token_ == bol_token || token_ == eol_token)
  588. {
  589. std::set<std::size_t> index_set_;
  590. index_set_.insert (token_);
  591. list_->back () = new equivset (index_set_,
  592. node_->greedy (), token_, node_->followpos ());
  593. }
  594. else
  595. {
  596. list_->back () = new equivset (set_mapping_[token_],
  597. node_->greedy (), token_, node_->followpos ());
  598. }
  599. }
  600. }
  601. }
  602. }
  603. static void fixup_bol (detail::node * &root_,
  604. node_ptr_vector &node_ptr_vector_)
  605. {
  606. typename detail::node::node_vector *first_ = &root_->firstpos ();
  607. bool found_ = false;
  608. typename detail::node::node_vector::const_iterator iter_ =
  609. first_->begin ();
  610. typename detail::node::node_vector::const_iterator end_ =
  611. first_->end ();
  612. for (; iter_ != end_; ++iter_)
  613. {
  614. const detail::node *node_ = *iter_;
  615. found_ = !node_->end_state () && node_->token () == bol_token;
  616. if (found_) break;
  617. }
  618. if (!found_)
  619. {
  620. node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0));
  621. node_ptr_vector_->back () = new detail::leaf_node
  622. (bol_token, true);
  623. detail::node *lhs_ = node_ptr_vector_->back ();
  624. node_ptr_vector_->push_back (static_cast<detail::leaf_node *>(0));
  625. node_ptr_vector_->back () = new detail::leaf_node
  626. (null_token, true);
  627. detail::node *rhs_ = node_ptr_vector_->back ();
  628. node_ptr_vector_->push_back
  629. (static_cast<detail::selection_node *>(0));
  630. node_ptr_vector_->back () =
  631. new detail::selection_node (lhs_, rhs_);
  632. lhs_ = node_ptr_vector_->back ();
  633. node_ptr_vector_->push_back
  634. (static_cast<detail::sequence_node *>(0));
  635. node_ptr_vector_->back () =
  636. new detail::sequence_node (lhs_, root_);
  637. root_ = node_ptr_vector_->back ();
  638. }
  639. }
  640. static void minimise_dfa (const std::size_t dfa_alphabet_,
  641. size_t_vector &dfa_, std::size_t size_)
  642. {
  643. const std::size_t *first_ = &dfa_.front ();
  644. const std::size_t *second_ = 0;
  645. const std::size_t *end_ = first_ + size_;
  646. std::size_t index_ = 1;
  647. std::size_t new_index_ = 1;
  648. std::size_t curr_index_ = 0;
  649. index_set index_set_;
  650. size_t_vector lookup_;
  651. std::size_t *lookup_ptr_ = 0;
  652. lookup_.resize (size_ / dfa_alphabet_, null_token);
  653. lookup_ptr_ = &lookup_.front ();
  654. *lookup_ptr_ = 0;
  655. // Only one 'jam' state, so skip it.
  656. first_ += dfa_alphabet_;
  657. for (; first_ < end_; first_ += dfa_alphabet_, ++index_)
  658. {
  659. for (second_ = first_ + dfa_alphabet_, curr_index_ = index_ + 1;
  660. second_ < end_; second_ += dfa_alphabet_, ++curr_index_)
  661. {
  662. if (index_set_.find (curr_index_) != index_set_.end ())
  663. {
  664. continue;
  665. }
  666. // Some systems have memcmp in namespace std.
  667. using namespace std;
  668. if (memcmp (first_, second_, sizeof (std::size_t) *
  669. dfa_alphabet_) == 0)
  670. {
  671. index_set_.insert (curr_index_);
  672. lookup_ptr_[curr_index_] = new_index_;
  673. }
  674. }
  675. if (lookup_ptr_[index_] == null_token)
  676. {
  677. lookup_ptr_[index_] = new_index_;
  678. ++new_index_;
  679. }
  680. }
  681. if (!index_set_.empty ())
  682. {
  683. const std::size_t *front_ = &dfa_.front ();
  684. size_t_vector new_dfa_ (front_, front_ + dfa_alphabet_);
  685. typename index_set::iterator set_end_ =
  686. index_set_.end ();
  687. const std::size_t *ptr_ = front_ + dfa_alphabet_;
  688. std::size_t *new_ptr_ = 0;
  689. new_dfa_.resize (size_ - index_set_.size () * dfa_alphabet_, 0);
  690. new_ptr_ = &new_dfa_.front () + dfa_alphabet_;
  691. size_ /= dfa_alphabet_;
  692. for (index_ = 1; index_ < size_; ++index_)
  693. {
  694. if (index_set_.find (index_) != set_end_)
  695. {
  696. ptr_ += dfa_alphabet_;
  697. continue;
  698. }
  699. new_ptr_[end_state_index] = ptr_[end_state_index];
  700. new_ptr_[id_index] = ptr_[id_index];
  701. new_ptr_[unique_id_index] = ptr_[unique_id_index];
  702. new_ptr_[state_index] = ptr_[state_index];
  703. new_ptr_[bol_index] = lookup_ptr_[ptr_[bol_index]];
  704. new_ptr_[eol_index] = lookup_ptr_[ptr_[eol_index]];
  705. new_ptr_ += dfa_offset;
  706. ptr_ += dfa_offset;
  707. for (std::size_t i_ = dfa_offset; i_ < dfa_alphabet_; ++i_)
  708. {
  709. *new_ptr_++ = lookup_ptr_[*ptr_++];
  710. }
  711. }
  712. dfa_.swap (new_dfa_);
  713. }
  714. }
  715. };
  716. typedef basic_generator<char> generator;
  717. typedef basic_generator<wchar_t> wgenerator;
  718. }
  719. }
  720. #endif