file_input.hpp 13 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475
  1. // file_input.hpp
  2. // Copyright (c) 2008-2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_LEXER_FILE_INPUT
  7. #define BOOST_LEXER_FILE_INPUT
  8. #include "char_traits.hpp"
  9. // memcpy
  10. #include <cstring>
  11. #include <fstream>
  12. #include "size_t.hpp"
  13. #include "state_machine.hpp"
  14. namespace boost
  15. {
  16. namespace lexer
  17. {
  18. template<typename CharT, typename Traits = char_traits<CharT> >
  19. class basic_file_input
  20. {
  21. public:
  22. class iterator
  23. {
  24. public:
  25. friend class basic_file_input;
  26. struct data
  27. {
  28. std::size_t id;
  29. std::size_t unique_id;
  30. const CharT *start;
  31. const CharT *end;
  32. std::size_t state;
  33. // Construct in end() state.
  34. data () :
  35. id (0),
  36. unique_id (npos),
  37. state (npos)
  38. {
  39. }
  40. bool operator == (const data &rhs_) const
  41. {
  42. return id == rhs_.id && unique_id == rhs_.unique_id &&
  43. start == rhs_.start && end == rhs_.end &&
  44. state == rhs_.state;
  45. }
  46. };
  47. iterator () :
  48. _input (0)
  49. {
  50. }
  51. bool operator == (const iterator &rhs_) const
  52. {
  53. return _data == rhs_._data;
  54. }
  55. bool operator != (const iterator &rhs_) const
  56. {
  57. return !(*this == rhs_);
  58. }
  59. data &operator * ()
  60. {
  61. return _data;
  62. }
  63. data *operator -> ()
  64. {
  65. return &_data;
  66. }
  67. // Let compiler generate operator = ().
  68. // prefix version
  69. iterator &operator ++ ()
  70. {
  71. next_token ();
  72. return *this;
  73. }
  74. // postfix version
  75. iterator operator ++ (int)
  76. {
  77. iterator iter_ = *this;
  78. next_token ();
  79. return iter_;
  80. }
  81. void next_token ()
  82. {
  83. const detail::internals &internals_ =
  84. _input->_state_machine->data ();
  85. _data.start = _data.end;
  86. if (internals_._dfa->size () == 1)
  87. {
  88. _data.id = _input->next (&internals_._lookup->front ()->
  89. front (), internals_._dfa_alphabet.front (),
  90. &internals_._dfa->front ()->front (), _data.start,
  91. _data.end, _data.unique_id);
  92. }
  93. else
  94. {
  95. _data.id = _input->next (internals_, _data.state, _data.start,
  96. _data.end, _data.unique_id);
  97. }
  98. if (_data.id == 0)
  99. {
  100. _data.start = 0;
  101. _data.end = 0;
  102. // Ensure current state matches that returned by end().
  103. _data.state = npos;
  104. }
  105. }
  106. private:
  107. // Not owner (obviously!)
  108. basic_file_input *_input;
  109. data _data;
  110. };
  111. friend class iterator;
  112. // Make it explicit that we are NOT taking a copy of state_machine_!
  113. basic_file_input (const basic_state_machine<CharT> *state_machine_,
  114. std::basic_ifstream<CharT> *is_,
  115. const std::streamsize buffer_size_ = 4096,
  116. const std::streamsize buffer_increment_ = 1024) :
  117. _state_machine (state_machine_),
  118. _stream (is_),
  119. _buffer_size (buffer_size_),
  120. _buffer_increment (buffer_increment_),
  121. _buffer (_buffer_size, '!')
  122. {
  123. _start_buffer = &_buffer.front ();
  124. _end_buffer = _start_buffer + _buffer.size ();
  125. _start_token = _end_buffer;
  126. _end_token = _end_buffer;
  127. }
  128. iterator begin ()
  129. {
  130. iterator iter_;
  131. iter_._input = this;
  132. // Over-ride default of 0 (EOF)
  133. iter_._data.id = npos;
  134. iter_._data.start = 0;
  135. iter_._data.end = 0;
  136. iter_._data.state = 0;
  137. ++iter_;
  138. return iter_;
  139. }
  140. iterator end ()
  141. {
  142. iterator iter_;
  143. iter_._input = this;
  144. iter_._data.start = 0;
  145. iter_._data.end = 0;
  146. return iter_;
  147. }
  148. void flush ()
  149. {
  150. // This temporary is mandatory, otherwise the
  151. // pointer calculations won't work!
  152. const CharT *temp_ = _end_buffer;
  153. _start_token = _end_token = _end_buffer;
  154. reload_buffer (temp_, true, _end_token);
  155. }
  156. private:
  // Stream type characters are read from.
  157. typedef std::basic_istream<CharT> istream;
  // Storage type of the read buffer.
  158. typedef std::vector<CharT> buffer;
  // State machine supplying the DFA tables; stored by pointer, not copied.
  159. const basic_state_machine<CharT> *_state_machine;
  // Initial size of _buffer, in characters.
  160. const std::streamsize _buffer_size;
  // Number of characters added when _buffer has to grow.
  161. const std::streamsize _buffer_increment;
  // Read buffer; NOTE: declared after the two sizes because it is
  // initialized from _buffer_size.
  162. buffer _buffer;
  // First element of _buffer (refreshed whenever _buffer is resized).
  163. CharT *_start_buffer;
  // Stream being lexed; stored by pointer.
  164. istream *_stream;
  // Start of the token currently being scanned, inside _buffer.
  165. const CharT *_start_token;
  // One past the end of the most recently returned token, inside _buffer.
  166. const CharT *_end_token;
  // One past the last character read into _buffer.
  167. CharT *_end_buffer;
  168. std::size_t next (const detail::internals &internals_,
  169. std::size_t &start_state_, const CharT * &start_, const CharT * &end_,
  170. std::size_t &unique_id_)
  171. {
  172. _start_token = _end_token;
  173. again:
  174. const std::size_t * lookup_ = &internals_._lookup[start_state_]->
  175. front ();
  176. std::size_t dfa_alphabet_ = internals_._dfa_alphabet[start_state_];
  177. const std::size_t *dfa_ = &internals_._dfa[start_state_]->front ();
  178. const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
  179. const CharT *curr_ = _start_token;
  180. bool end_state_ = *ptr_ != 0;
  181. std::size_t id_ = *(ptr_ + id_index);
  182. std::size_t uid_ = *(ptr_ + unique_id_index);
  183. const CharT *end_token_ = curr_;
  184. for (;;)
  185. {
  186. if (curr_ >= _end_buffer)
  187. {
  188. if (!reload_buffer (curr_, end_state_, end_token_))
  189. {
  190. // EOF
  191. break;
  192. }
  193. }
  194. const std::size_t BOL_state_ = ptr_[bol_index];
  195. const std::size_t EOL_state_ = ptr_[eol_index];
  196. if (BOL_state_ && (_start_token == _start_buffer ||
  197. *(_start_token - 1) == '\n'))
  198. {
  199. ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
  200. }
  201. else if (EOL_state_ && *curr_ == '\n')
  202. {
  203. ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
  204. }
  205. else
  206. {
  207. const std::size_t state_ =
  208. ptr_[lookup_[static_cast<typename Traits::index_type>
  209. (*curr_++)]];
  210. if (state_ == 0)
  211. {
  212. break;
  213. }
  214. ptr_ = &dfa_[state_ * dfa_alphabet_];
  215. }
  216. if (*ptr_)
  217. {
  218. end_state_ = true;
  219. id_ = *(ptr_ + id_index);
  220. uid_ = *(ptr_ + unique_id_index);
  221. start_state_ = *(ptr_ + state_index);
  222. end_token_ = curr_;
  223. }
  224. }
  225. if (_start_token >= _end_buffer)
  226. {
  227. // No more tokens...
  228. unique_id_ = npos;
  229. return 0;
  230. }
  231. const std::size_t EOL_state_ = ptr_[eol_index];
  232. if (EOL_state_ && curr_ == end_)
  233. {
  234. ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
  235. if (*ptr_)
  236. {
  237. end_state_ = true;
  238. id_ = *(ptr_ + id_index);
  239. uid_ = *(ptr_ + unique_id_index);
  240. start_state_ = *(ptr_ + state_index);
  241. end_token_ = curr_;
  242. }
  243. }
  244. if (end_state_)
  245. {
  246. // return longest match
  247. _end_token = end_token_;
  248. if (id_ == 0) goto again;
  249. }
  250. else
  251. {
  252. // No match causes char to be skipped
  253. _end_token = _start_token + 1;
  254. id_ = npos;
  255. uid_ = npos;
  256. }
  257. start_ = _start_token;
  258. end_ = _end_token;
  259. unique_id_ = uid_;
  260. return id_;
  261. }
  262. std::size_t next (const std::size_t * const lookup_,
  263. const std::size_t dfa_alphabet_, const std::size_t * const dfa_,
  264. const CharT * &start_, const CharT * &end_, std::size_t &unique_id_)
  265. {
  266. _start_token = _end_token;
  267. const std::size_t *ptr_ = dfa_ + dfa_alphabet_;
  268. const CharT *curr_ = _start_token;
  269. bool end_state_ = *ptr_ != 0;
  270. std::size_t id_ = *(ptr_ + id_index);
  271. std::size_t uid_ = *(ptr_ + unique_id_index);
  272. const CharT *end_token_ = curr_;
  273. for (;;)
  274. {
  275. if (curr_ >= _end_buffer)
  276. {
  277. if (!reload_buffer (curr_, end_state_, end_token_))
  278. {
  279. // EOF
  280. break;
  281. }
  282. }
  283. const std::size_t BOL_state_ = ptr_[bol_index];
  284. const std::size_t EOL_state_ = ptr_[eol_index];
  285. if (BOL_state_ && (_start_token == _start_buffer ||
  286. *(_start_token - 1) == '\n'))
  287. {
  288. ptr_ = &dfa_[BOL_state_ * dfa_alphabet_];
  289. }
  290. else if (EOL_state_ && *curr_ == '\n')
  291. {
  292. ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
  293. }
  294. else
  295. {
  296. const std::size_t state_ =
  297. ptr_[lookup_[static_cast<typename Traits::index_type>
  298. (*curr_++)]];
  299. if (state_ == 0)
  300. {
  301. break;
  302. }
  303. ptr_ = &dfa_[state_ * dfa_alphabet_];
  304. }
  305. if (*ptr_)
  306. {
  307. end_state_ = true;
  308. id_ = *(ptr_ + id_index);
  309. uid_ = *(ptr_ + unique_id_index);
  310. end_token_ = curr_;
  311. }
  312. }
  313. if (_start_token >= _end_buffer)
  314. {
  315. // No more tokens...
  316. unique_id_ = npos;
  317. return 0;
  318. }
  319. const std::size_t EOL_state_ = ptr_[eol_index];
  320. if (EOL_state_ && curr_ == end_)
  321. {
  322. ptr_ = &dfa_[EOL_state_ * dfa_alphabet_];
  323. if (*ptr_)
  324. {
  325. end_state_ = true;
  326. id_ = *(ptr_ + id_index);
  327. uid_ = *(ptr_ + unique_id_index);
  328. end_token_ = curr_;
  329. }
  330. }
  331. if (end_state_)
  332. {
  333. // return longest match
  334. _end_token = end_token_;
  335. }
  336. else
  337. {
  338. // No match causes char to be skipped
  339. _end_token = _start_token + 1;
  340. id_ = npos;
  341. uid_ = npos;
  342. }
  343. start_ = _start_token;
  344. end_ = _end_token;
  345. unique_id_ = uid_;
  346. return id_;
  347. }
  348. bool reload_buffer (const CharT * &curr_, const bool end_state_,
  349. const CharT * &end_token_)
  350. {
  351. bool success_ = !_stream->eof ();
  352. if (success_)
  353. {
  354. const CharT *old_start_token_ = _start_token;
  355. std::size_t old_size_ = _buffer.size ();
  356. std::size_t count_ = 0;
  357. if (_start_token - 1 == _start_buffer)
  358. {
  359. // Run out of buffer space, so increase.
  360. _buffer.resize (old_size_ + _buffer_increment, '!');
  361. _start_buffer = &_buffer.front ();
  362. _start_token = _start_buffer + 1;
  363. _stream->read (_start_buffer + old_size_,
  364. _buffer_increment);
  365. count_ = _stream->gcount ();
  366. _end_buffer = _start_buffer + old_size_ + count_;
  367. }
  368. else if (_start_token < _end_buffer)
  369. {
  370. const std::size_t len_ = _end_buffer - _start_token;
  371. // Some systems have memcpy in namespace std.
  372. using namespace std;
  373. memcpy (_start_buffer, _start_token - 1, (len_ + 1) *
  374. sizeof (CharT));
  375. _stream->read (_start_buffer + len_ + 1,
  376. static_cast<std::streamsize> (_buffer.size () - len_ - 1));
  377. count_ = _stream->gcount ();
  378. _start_token = _start_buffer + 1;
  379. _end_buffer = _start_buffer + len_ + 1 + count_;
  380. }
  381. else
  382. {
  383. _stream->read (_start_buffer, static_cast<std::streamsize>
  384. (_buffer.size ()));
  385. count_ = _stream->gcount ();
  386. _start_token = _start_buffer;
  387. _end_buffer = _start_buffer + count_;
  388. }
  389. if (end_state_)
  390. {
  391. end_token_ = _start_token +
  392. (end_token_ - old_start_token_);
  393. }
  394. curr_ = _start_token + (curr_ - old_start_token_);
  395. }
  396. return success_;
  397. }
  398. // Disallow copying of buffer
  399. basic_file_input (const basic_file_input &);
  400. const basic_file_input &operator = (const basic_file_input &);
  401. };
  402. typedef basic_file_input<char> file_input;
  403. typedef basic_file_input<wchar_t> wfile_input;
  404. }
  405. }
  406. #endif