input.hpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529
  1. // input.hpp
  2. // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_LEXER_INPUT
  7. #define BOOST_LEXER_INPUT
  8. #include "char_traits.hpp"
  9. #include "size_t.hpp"
  10. #include "state_machine.hpp"
  11. #include <iterator> // for std::iterator_traits
  12. namespace boost
  13. {
  14. namespace lexer
  15. {
  16. template<typename FwdIter, typename Traits =
  17. char_traits<typename std::iterator_traits<FwdIter>::value_type> >
  18. class basic_input
  19. {
  20. public:
  21. class iterator
  22. {
  23. public:
  24. friend class basic_input;
  25. struct data
  26. {
  27. std::size_t id;
  28. std::size_t unique_id;
  29. FwdIter start;
  30. FwdIter end;
  31. bool bol;
  32. std::size_t state;
  33. // Construct in end() state.
  34. data () :
  35. id (0),
  36. unique_id (npos),
  37. bol (false),
  38. state (npos)
  39. {
  40. }
  41. bool operator == (const data &rhs_) const
  42. {
  43. return id == rhs_.id && unique_id == rhs_.unique_id &&
  44. start == rhs_.start && end == rhs_.end &&
  45. bol == rhs_.bol && state == rhs_.state;
  46. }
  47. };
  48. iterator () :
  49. _input (0)
  50. {
  51. }
  52. bool operator == (const iterator &rhs_) const
  53. {
  54. return _data == rhs_._data;
  55. }
  56. bool operator != (const iterator &rhs_) const
  57. {
  58. return !(*this == rhs_);
  59. }
  60. data &operator * ()
  61. {
  62. return _data;
  63. }
  64. data *operator -> ()
  65. {
  66. return &_data;
  67. }
  68. // Let compiler generate operator = ().
  69. // prefix version
  70. iterator &operator ++ ()
  71. {
  72. next_token ();
  73. return *this;
  74. }
  75. // postfix version
  76. iterator operator ++ (int)
  77. {
  78. iterator iter_ = *this;
  79. next_token ();
  80. return iter_;
  81. }
  82. private:
  83. // Not owner (obviously!)
  84. const basic_input *_input;
  85. data _data;
  86. void next_token ()
  87. {
  88. const detail::internals &internals_ =
  89. _input->_state_machine->data ();
  90. _data.start = _data.end;
  91. if (internals_._dfa->size () == 1)
  92. {
  93. if (internals_._seen_BOL_assertion ||
  94. internals_._seen_EOL_assertion)
  95. {
  96. _data.id = next
  97. (&internals_._lookup->front ()->front (),
  98. internals_._dfa_alphabet.front (),
  99. &internals_._dfa->front ()->front (),
  100. _data.bol, _data.end, _input->_end, _data.unique_id);
  101. }
  102. else
  103. {
  104. _data.id = next (&internals_._lookup->front ()->front (),
  105. internals_._dfa_alphabet.front (), &internals_.
  106. _dfa->front ()->front (), _data.end, _input->_end,
  107. _data.unique_id);
  108. }
  109. }
  110. else
  111. {
  112. if (internals_._seen_BOL_assertion ||
  113. internals_._seen_EOL_assertion)
  114. {
  115. _data.id = next (internals_, _data.state,
  116. _data.bol, _data.end, _input->_end, _data.unique_id);
  117. }
  118. else
  119. {
  120. _data.id = next (internals_, _data.state,
  121. _data.end, _input->_end, _data.unique_id);
  122. }
  123. }
  124. if (_data.end == _input->_end && _data.start == _data.end)
  125. {
  126. // Ensure current state matches that returned by end().
  127. _data.state = npos;
  128. }
  129. }
  130. std::size_t next (const detail::internals &internals_,
  131. std::size_t &start_state_, bool bol_,
  132. FwdIter &start_token_, const FwdIter &end_,
  133. std::size_t &unique_id_)
  134. {
  135. if (start_token_ == end_)
  136. {
  137. unique_id_ = npos;
  138. return 0;
  139. }
  140. again:
  141. const std::size_t * lookup_ = &internals_._lookup[start_state_]->
  142. front ();
  143. std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_];
  144. const std::size_t *dfa_ = &internals_._dfa[start_state_]->front ();
  145. const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
  146. FwdIter curr_ = start_token_;
  147. bool end_state_ = *ptr_ != 0;
  148. std::size_t id_ = *(ptr_ + id_index);
  149. std::size_t uid_ = *(ptr_ + unique_id_index);
  150. std::size_t end_start_state_ = start_state_;
  151. bool end_bol_ = bol_;
  152. FwdIter end_token_ = start_token_;
  153. while (curr_ != end_)
  154. {
  155. const std::size_t BOL_state_ = ptr_[bol_index];
  156. const std::size_t EOL_state_ = ptr_[eol_index];
  157. if (BOL_state_ && bol_)
  158. {
  159. ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
  160. }
  161. else if (EOL_state_ && *curr_ == '\n')
  162. {
  163. ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
  164. }
  165. else
  166. {
  167. typename Traits::char_type prev_char_ = *curr_++;
  168. bol_ = prev_char_ == '\n';
  169. const std::size_t state_ =
  170. ptr_[lookup_[static_cast<typename Traits::index_type>
  171. (prev_char_)]];
  172. if (state_ == 0)
  173. {
  174. break;
  175. }
  176. ptr_ = &dfa_[state_ * dfa_alphabet_];
  177. }
  178. if (*ptr_)
  179. {
  180. end_state_ = true;
  181. id_ = *(ptr_ + id_index);
  182. uid_ = *(ptr_ + unique_id_index);
  183. end_start_state_ = *(ptr_ + state_index);
  184. end_bol_ = bol_;
  185. end_token_ = curr_;
  186. }
  187. }
  188. const std::size_t EOL_state_ = ptr_[eol_index];
  189. if (EOL_state_ && curr_ == end_)
  190. {
  191. ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
  192. if (*ptr_)
  193. {
  194. end_state_ = true;
  195. id_ = *(ptr_ + id_index);
  196. uid_ = *(ptr_ + unique_id_index);
  197. end_start_state_ = *(ptr_ + state_index);
  198. end_bol_ = bol_;
  199. end_token_ = curr_;
  200. }
  201. }
  202. if (end_state_)
  203. {
  204. // return longest match
  205. start_state_ = end_start_state_;
  206. start_token_ = end_token_;
  207. if (id_ == 0)
  208. {
  209. bol_ = end_bol_;
  210. goto again;
  211. }
  212. else
  213. {
  214. _data.bol = end_bol_;
  215. }
  216. }
  217. else
  218. {
  219. // No match causes char to be skipped
  220. _data.bol = *start_token_ == '\n';
  221. ++start_token_;
  222. id_ = npos;
  223. uid_ = npos;
  224. }
  225. unique_id_ = uid_;
  226. return id_;
  227. }
  228. std::size_t next (const detail::internals &internals_,
  229. std::size_t &start_state_, FwdIter &start_token_,
  230. FwdIter const &end_, std::size_t &unique_id_)
  231. {
  232. if (start_token_ == end_)
  233. {
  234. unique_id_ = npos;
  235. return 0;
  236. }
  237. again:
  238. const std::size_t * lookup_ = &internals_._lookup[start_state_]->
  239. front ();
  240. std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_];
  241. const std::size_t *dfa_ = &internals_._dfa[start_state_]->front ();
  242. const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
  243. FwdIter curr_ = start_token_;
  244. bool end_state_ = *ptr_ != 0;
  245. std::size_t id_ = *(ptr_ + id_index);
  246. std::size_t uid_ = *(ptr_ + unique_id_index);
  247. std::size_t end_start_state_ = start_state_;
  248. FwdIter end_token_ = start_token_;
  249. while (curr_ != end_)
  250. {
  251. const std::size_t state_ = ptr_[lookup_[static_cast
  252. <typename Traits::index_type>(*curr_++)]];
  253. if (state_ == 0)
  254. {
  255. break;
  256. }
  257. ptr_ = &dfa_[state_ * dfa_alphabet_];
  258. if (*ptr_)
  259. {
  260. end_state_ = true;
  261. id_ = *(ptr_ + id_index);
  262. uid_ = *(ptr_ + unique_id_index);
  263. end_start_state_ = *(ptr_ + state_index);
  264. end_token_ = curr_;
  265. }
  266. }
  267. if (end_state_)
  268. {
  269. // return longest match
  270. start_state_ = end_start_state_;
  271. start_token_ = end_token_;
  272. if (id_ == 0) goto again;
  273. }
  274. else
  275. {
  276. // No match causes char to be skipped
  277. ++start_token_;
  278. id_ = npos;
  279. uid_ = npos;
  280. }
  281. unique_id_ = uid_;
  282. return id_;
  283. }
  284. std::size_t next (const std::size_t * const lookup_,
  285. const std::size_t dfa_alphabet_, const std::size_t * const dfa_,
  286. bool bol_, FwdIter &start_token_, FwdIter const &end_,
  287. std::size_t &unique_id_)
  288. {
  289. if (start_token_ == end_)
  290. {
  291. unique_id_ = npos;
  292. return 0;
  293. }
  294. const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
  295. FwdIter curr_ = start_token_;
  296. bool end_state_ = *ptr_ != 0;
  297. std::size_t id_ = *(ptr_ + id_index);
  298. std::size_t uid_ = *(ptr_ + unique_id_index);
  299. bool end_bol_ = bol_;
  300. FwdIter end_token_ = start_token_;
  301. while (curr_ != end_)
  302. {
  303. const std::size_t BOL_state_ = ptr_[bol_index];
  304. const std::size_t EOL_state_ = ptr_[eol_index];
  305. if (BOL_state_ && bol_)
  306. {
  307. ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
  308. }
  309. else if (EOL_state_ && *curr_ == '\n')
  310. {
  311. ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
  312. }
  313. else
  314. {
  315. typename Traits::char_type prev_char_ = *curr_++;
  316. bol_ = prev_char_ == '\n';
  317. const std::size_t state_ =
  318. ptr_[lookup_[static_cast<typename Traits::index_type>
  319. (prev_char_)]];
  320. if (state_ == 0)
  321. {
  322. break;
  323. }
  324. ptr_ = &dfa_[state_ * dfa_alphabet_];
  325. }
  326. if (*ptr_)
  327. {
  328. end_state_ = true;
  329. id_ = *(ptr_ + id_index);
  330. uid_ = *(ptr_ + unique_id_index);
  331. end_bol_ = bol_;
  332. end_token_ = curr_;
  333. }
  334. }
  335. const std::size_t EOL_state_ = ptr_[eol_index];
  336. if (EOL_state_ && curr_ == end_)
  337. {
  338. ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
  339. if (*ptr_)
  340. {
  341. end_state_ = true;
  342. id_ = *(ptr_ + id_index);
  343. uid_ = *(ptr_ + unique_id_index);
  344. end_bol_ = bol_;
  345. end_token_ = curr_;
  346. }
  347. }
  348. if (end_state_)
  349. {
  350. // return longest match
  351. _data.bol = end_bol_;
  352. start_token_ = end_token_;
  353. }
  354. else
  355. {
  356. // No match causes char to be skipped
  357. _data.bol = *start_token_ == '\n';
  358. ++start_token_;
  359. id_ = npos;
  360. uid_ = npos;
  361. }
  362. unique_id_ = uid_;
  363. return id_;
  364. }
  365. std::size_t next (const std::size_t * const lookup_,
  366. const std::size_t dfa_alphabet_, const std::size_t * const dfa_,
  367. FwdIter &start_token_, FwdIter const &end_,
  368. std::size_t &unique_id_)
  369. {
  370. if (start_token_ == end_)
  371. {
  372. unique_id_ = npos;
  373. return 0;
  374. }
  375. const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
  376. FwdIter curr_ = start_token_;
  377. bool end_state_ = *ptr_ != 0;
  378. std::size_t id_ = *(ptr_ + id_index);
  379. std::size_t uid_ = *(ptr_ + unique_id_index);
  380. FwdIter end_token_ = start_token_;
  381. while (curr_ != end_)
  382. {
  383. const std::size_t state_ = ptr_[lookup_[static_cast
  384. <typename Traits::index_type>(*curr_++)]];
  385. if (state_ == 0)
  386. {
  387. break;
  388. }
  389. ptr_ = &dfa_[state_ * dfa_alphabet_];
  390. if (*ptr_)
  391. {
  392. end_state_ = true;
  393. id_ = *(ptr_ + id_index);
  394. uid_ = *(ptr_ + unique_id_index);
  395. end_token_ = curr_;
  396. }
  397. }
  398. if (end_state_)
  399. {
  400. // return longest match
  401. start_token_ = end_token_;
  402. }
  403. else
  404. {
  405. // No match causes char to be skipped
  406. ++start_token_;
  407. id_ = npos;
  408. uid_ = npos;
  409. }
  410. unique_id_ = uid_;
  411. return id_;
  412. }
  413. };
  414. friend class iterator;
  415. // Make it explict that we are NOT taking a copy of state_machine_!
  416. basic_input (const basic_state_machine<typename Traits::char_type>
  417. *state_machine_, const FwdIter &begin_, const FwdIter &end_) :
  418. _state_machine (state_machine_),
  419. _begin (begin_),
  420. _end (end_)
  421. {
  422. }
  423. iterator begin () const
  424. {
  425. iterator iter_;
  426. iter_._input = this;
  427. // Over-ride default of 0 (EOI)
  428. iter_._data.id = npos;
  429. iter_._data.start = _begin;
  430. iter_._data.end = _begin;
  431. iter_._data.bol = _state_machine->data ()._seen_BOL_assertion;
  432. iter_._data.state = 0;
  433. ++iter_;
  434. return iter_;
  435. }
  436. iterator end () const
  437. {
  438. iterator iter_;
  439. iter_._input = this;
  440. iter_._data.start = _end;
  441. iter_._data.end = _end;
  442. return iter_;
  443. }
  444. private:
  445. const basic_state_machine<typename Traits::char_type> *_state_machine;
  446. FwdIter _begin;
  447. FwdIter _end;
  448. };
  449. typedef basic_input<std::string::iterator> iter_input;
  450. typedef basic_input<std::basic_string<wchar_t>::iterator> iter_winput;
  451. typedef basic_input<const char *> ptr_input;
  452. typedef basic_input<const wchar_t *> ptr_winput;
  453. }
  454. }
  455. #endif