state_machine.hpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431
  1. // state_machine.hpp
  2. // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_LEXER_STATE_MACHINE_HPP
  7. #define BOOST_LEXER_STATE_MACHINE_HPP
  8. #include <algorithm>
  9. #include "conversion/char_state_machine.hpp"
  10. #include "consts.hpp"
  11. #include <deque>
  12. #include "internals.hpp"
  13. #include <map>
  14. #include "containers/ptr_vector.hpp"
  15. #include "size_t.hpp"
  16. #include <string>
  17. namespace boost
  18. {
  19. namespace lexer
  20. {
  21. template<typename CharT>
  22. class basic_state_machine
  23. {
  24. public:
  25. typedef CharT char_type;
  26. class iterator
  27. {
  28. public:
  29. friend class basic_state_machine;
  30. struct data
  31. {
  32. // Current iterator info
  33. std::size_t dfa;
  34. std::size_t states;
  35. std::size_t state;
  36. std::size_t transitions;
  37. std::size_t transition;
  38. // Current state info
  39. bool end_state;
  40. std::size_t id;
  41. std::size_t unique_id;
  42. std::size_t goto_dfa;
  43. std::size_t bol_index;
  44. std::size_t eol_index;
  45. // Current transition info
  46. basic_string_token<CharT> token;
  47. std::size_t goto_state;
  48. data () :
  49. dfa (npos),
  50. states (0),
  51. state (npos),
  52. transitions (0),
  53. transition (npos),
  54. end_state (false),
  55. id (npos),
  56. unique_id (npos),
  57. goto_dfa (npos),
  58. bol_index (npos),
  59. eol_index (npos),
  60. goto_state (npos)
  61. {
  62. }
  63. bool operator == (const data &rhs_) const
  64. {
  65. return dfa == rhs_.dfa &&
  66. states == rhs_.states &&
  67. state == rhs_.state &&
  68. transitions == rhs_.transitions &&
  69. transition == rhs_.transition &&
  70. end_state == rhs_.end_state &&
  71. id == rhs_.id &&
  72. unique_id == rhs_.unique_id &&
  73. goto_dfa == rhs_.goto_dfa &&
  74. bol_index == rhs_.bol_index &&
  75. eol_index == rhs_.eol_index &&
  76. token == rhs_.token &&
  77. goto_state == rhs_.goto_state;
  78. }
  79. };
  80. iterator () :
  81. _sm (0),
  82. _dfas (0),
  83. _dfa (npos),
  84. _states (0),
  85. _state (npos),
  86. _transitions (0),
  87. _transition (npos)
  88. {
  89. }
  90. bool operator == (const iterator &rhs_) const
  91. {
  92. return _dfas == rhs_._dfas && _dfa == rhs_._dfa &&
  93. _states == rhs_._states && _state == rhs_._state &&
  94. _transitions == rhs_._transitions &&
  95. _transition == rhs_._transition;
  96. }
  97. bool operator != (const iterator &rhs_) const
  98. {
  99. return !(*this == rhs_);
  100. }
  101. data &operator * ()
  102. {
  103. return _data;
  104. }
  105. data *operator -> ()
  106. {
  107. return &_data;
  108. }
  109. // Let compiler generate operator = ().
  110. // prefix version
  111. iterator &operator ++ ()
  112. {
  113. next ();
  114. return *this;
  115. }
  116. // postfix version
  117. iterator operator ++ (int)
  118. {
  119. iterator iter_ = *this;
  120. next ();
  121. return iter_;
  122. }
  123. void clear ()
  124. {
  125. _dfas = _states = _transitions = 0;
  126. _dfa = _state = _transition = npos;
  127. }
  128. private:
  129. basic_state_machine *_sm;
  130. data _data;
  131. std::size_t _dfas;
  132. std::size_t _dfa;
  133. std::size_t _states;
  134. std::size_t _state;
  135. std::size_t _transitions;
  136. std::size_t _transition;
  137. typename detail::basic_char_state_machine<CharT>::state::
  138. size_t_string_token_map::const_iterator _token_iter;
  139. typename detail::basic_char_state_machine<CharT>::state::
  140. size_t_string_token_map::const_iterator _token_end;
  141. void next ()
  142. {
  143. bool reset_state_ = false;
  144. if (_transition >= _transitions)
  145. {
  146. _transition = _data.transition = 0;
  147. _data.state = ++_state;
  148. reset_state_ = true;
  149. if (_state >= _states)
  150. {
  151. ++_dfa;
  152. if (_dfa >= _dfas)
  153. {
  154. clear ();
  155. reset_state_ = false;
  156. }
  157. else
  158. {
  159. _states = _data.states =
  160. _sm->_csm._sm_vector[_dfa].size ();
  161. _state = _data.state = 0;
  162. }
  163. }
  164. }
  165. else
  166. {
  167. _data.transition = _transition;
  168. }
  169. if (reset_state_)
  170. {
  171. const typename detail::basic_char_state_machine<CharT>::
  172. state *ptr_ = &_sm->_csm._sm_vector[_dfa][_state];
  173. _transitions = _data.transitions = ptr_->_transitions.size ();
  174. _data.end_state = ptr_->_end_state;
  175. _data.id = ptr_->_id;
  176. _data.unique_id = ptr_->_unique_id;
  177. _data.goto_dfa = ptr_->_state;
  178. _data.bol_index = ptr_->_bol_index;
  179. _data.eol_index = ptr_->_eol_index;
  180. _token_iter = ptr_->_transitions.begin ();
  181. _token_end = ptr_->_transitions.end ();
  182. }
  183. if (_token_iter != _token_end)
  184. {
  185. _data.token = _token_iter->second;
  186. _data.goto_state = _token_iter->first;
  187. ++_token_iter;
  188. ++_transition;
  189. }
  190. else
  191. {
  192. _data.token.clear ();
  193. _data.goto_state = npos;
  194. }
  195. }
  196. };
  197. friend class iterator;
  198. basic_state_machine ()
  199. {
  200. }
  201. void clear ()
  202. {
  203. _internals.clear ();
  204. _csm.clear ();
  205. }
  206. bool empty () const
  207. {
  208. // Don't include _csm in this test, as irrelevant to state.
  209. return _internals._lookup->empty () &&
  210. _internals._dfa_alphabet.empty () &&
  211. _internals._dfa->empty ();
  212. }
  213. std::size_t size () const
  214. {
  215. return _internals._dfa->size ();
  216. }
  217. bool operator == (const basic_state_machine &rhs_) const
  218. {
  219. // Don't include _csm in this test, as irrelevant to state.
  220. return _internals._lookup == rhs_._internals._lookup &&
  221. _internals._dfa_alphabet == rhs_._internals._dfa_alphabet &&
  222. _internals._dfa == rhs_._internals._dfa &&
  223. _internals._seen_BOL_assertion ==
  224. rhs_._internals._seen_BOL_assertion &&
  225. _internals._seen_EOL_assertion ==
  226. rhs_._internals._seen_EOL_assertion;
  227. }
  228. iterator begin () const
  229. {
  230. iterator iter_;
  231. iter_._sm = const_cast<basic_state_machine *>(this);
  232. check_for_csm ();
  233. if (!_csm.empty ())
  234. {
  235. const typename detail::basic_char_state_machine<CharT>::
  236. state_vector *ptr_ = &_csm._sm_vector.front ();
  237. iter_._dfas = _csm._sm_vector.size ();
  238. iter_._states = iter_._data.states = ptr_->size ();
  239. iter_._transitions = iter_._data.transitions =
  240. ptr_->front ()._transitions.size ();
  241. iter_._dfa = iter_._data.dfa = 0;
  242. iter_._state = iter_._data.state = 0;
  243. iter_._transition = 0;
  244. iter_._data.end_state = ptr_->front ()._end_state;
  245. iter_._data.id = ptr_->front ()._id;
  246. iter_._data.unique_id = ptr_->front ()._unique_id;
  247. iter_._data.goto_dfa = ptr_->front ()._state;
  248. iter_._data.bol_index = ptr_->front ()._bol_index;
  249. iter_._data.eol_index = ptr_->front ()._eol_index;
  250. iter_._token_iter = ptr_->front ()._transitions.begin ();
  251. iter_._token_end = ptr_->front ()._transitions.end ();
  252. // Deal with case where there is only a bol or eol
  253. // but no other transitions.
  254. if (iter_._transitions)
  255. {
  256. ++iter_;
  257. }
  258. }
  259. return iter_;
  260. }
  261. iterator end () const
  262. {
  263. iterator iter_;
  264. iter_._sm = const_cast<basic_state_machine *>(this);
  265. return iter_;
  266. }
  267. void swap (basic_state_machine &sm_)
  268. {
  269. _internals.swap (sm_._internals);
  270. _csm.swap (sm_._csm);
  271. }
  272. const detail::internals &data () const
  273. {
  274. return _internals;
  275. }
  276. private:
  277. detail::internals _internals;
  278. mutable detail::basic_char_state_machine<CharT> _csm;
  279. void check_for_csm () const
  280. {
  281. if (_csm.empty ())
  282. {
  283. human_readable (_csm);
  284. }
  285. }
  286. void human_readable (detail::basic_char_state_machine<CharT> &sm_) const
  287. {
  288. const std::size_t max_ = sizeof (CharT) == 1 ?
  289. num_chars : num_wchar_ts;
  290. const std::size_t start_states_ = _internals._dfa->size ();
  291. sm_.clear ();
  292. sm_._sm_vector.resize (start_states_);
  293. for (std::size_t start_state_index_ = 0;
  294. start_state_index_ < start_states_; ++start_state_index_)
  295. {
  296. const detail::internals::size_t_vector *lu_ =
  297. _internals._lookup[start_state_index_];
  298. const std::size_t alphabet_ =
  299. _internals._dfa_alphabet[start_state_index_] - dfa_offset;
  300. std::vector<std::basic_string<CharT> > chars_ (alphabet_);
  301. const std::size_t states_ = _internals._dfa[start_state_index_]->
  302. size () / (alphabet_ + dfa_offset);
  303. const std::size_t *read_ptr_ = &_internals.
  304. _dfa[start_state_index_]->front () + alphabet_ + dfa_offset;
  305. sm_._sm_vector[start_state_index_].resize (states_ - 1);
  306. for (std::size_t alpha_index_ = 0; alpha_index_ < max_;
  307. ++alpha_index_)
  308. {
  309. const std::size_t col_ = lu_->at (alpha_index_);
  310. if (col_ != static_cast<std::size_t>(dead_state_index))
  311. {
  312. chars_[col_ - dfa_offset] += static_cast<CharT>
  313. (alpha_index_);
  314. }
  315. }
  316. for (std::size_t state_index_ = 1; state_index_ < states_;
  317. ++state_index_)
  318. {
  319. typename detail::basic_char_state_machine<CharT>::state
  320. *state_ = &sm_._sm_vector[start_state_index_]
  321. [state_index_ - 1];
  322. state_->_end_state = *read_ptr_ != 0;
  323. state_->_id = *(read_ptr_ + id_index);
  324. state_->_unique_id = *(read_ptr_ + unique_id_index);
  325. state_->_state = *(read_ptr_ + state_index);
  326. state_->_bol_index = *(read_ptr_ + bol_index) - 1;
  327. state_->_eol_index = *(read_ptr_ + eol_index) - 1;
  328. read_ptr_ += dfa_offset;
  329. for (std::size_t col_index_ = 0; col_index_ < alphabet_;
  330. ++col_index_, ++read_ptr_)
  331. {
  332. const std::size_t transition_ = *read_ptr_;
  333. if (transition_ != 0)
  334. {
  335. const std::size_t i_ = transition_ - 1;
  336. typename detail::basic_char_state_machine<CharT>::
  337. state::size_t_string_token_map::iterator iter_ =
  338. state_->_transitions.find (i_);
  339. if (iter_ == state_->_transitions.end ())
  340. {
  341. basic_string_token<CharT> token_
  342. (false, chars_[col_index_]);
  343. typename detail::basic_char_state_machine<CharT>::
  344. state::size_t_string_token_pair pair_
  345. (i_, token_);
  346. state_->_transitions.insert (pair_);
  347. }
  348. else
  349. {
  350. iter_->second._charset += chars_[col_index_];
  351. }
  352. }
  353. }
  354. for (typename detail::basic_char_state_machine<CharT>::state::
  355. size_t_string_token_map::iterator iter_ =
  356. state_->_transitions.begin (),
  357. end_ = state_->_transitions.end ();
  358. iter_ != end_; ++iter_)
  359. {
  360. std::sort (iter_->second._charset.begin (),
  361. iter_->second._charset.end ());
  362. iter_->second.normalise ();
  363. }
  364. }
  365. }
  366. }
  367. };
  368. typedef basic_state_machine<char> state_machine;
  369. typedef basic_state_machine<wchar_t> wstate_machine;
  370. }
  371. }
  372. #endif