lexer.hpp 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410
  1. // Copyright (c) 2001-2011 Hartmut Kaiser
  2. //
  3. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. #if !defined(BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM)
  6. #define BOOST_SPIRIT_LEX_LEXER_MAR_13_2007_0145PM
  7. #if defined(_MSC_VER)
  8. #pragma once
  9. #endif
  10. #include <boost/spirit/home/support/info.hpp>
  11. #include <boost/spirit/home/qi/skip_over.hpp>
  12. #include <boost/spirit/home/qi/parser.hpp>
  13. #include <boost/spirit/home/qi/detail/assign_to.hpp>
  14. #include <boost/spirit/home/lex/reference.hpp>
  15. #include <boost/spirit/home/lex/meta_compiler.hpp>
  16. #include <boost/spirit/home/lex/lexer_type.hpp>
  17. #include <boost/spirit/home/lex/lexer/token_def.hpp>
  18. #include <boost/assert.hpp>
  19. #include <boost/noncopyable.hpp>
  20. #include <boost/fusion/include/vector.hpp>
  21. #include <boost/mpl/assert.hpp>
  22. #include <boost/range/iterator_range.hpp>
  23. #include <iterator> // for std::iterator_traits
  24. #include <string>
  25. namespace boost { namespace spirit { namespace lex
  26. {
  27. ///////////////////////////////////////////////////////////////////////////
  28. namespace detail
  29. {
  30. ///////////////////////////////////////////////////////////////////////
  31. template <typename LexerDef>
  32. struct lexer_def_
  33. : proto::extends<
  34. typename proto::terminal<
  35. lex::reference<lexer_def_<LexerDef> const>
  36. >::type
  37. , lexer_def_<LexerDef> >
  38. , qi::parser<lexer_def_<LexerDef> >
  39. , lex::lexer_type<lexer_def_<LexerDef> >
  40. {
  41. private:
  42. // avoid warnings about using 'this' in constructor
  43. lexer_def_& this_() { return *this; }
  44. typedef typename LexerDef::char_type char_type;
  45. typedef typename LexerDef::string_type string_type;
  46. typedef typename LexerDef::id_type id_type;
  47. typedef lex::reference<lexer_def_ const> reference_;
  48. typedef typename proto::terminal<reference_>::type terminal_type;
  49. typedef proto::extends<terminal_type, lexer_def_> proto_base_type;
  50. reference_ alias() const
  51. {
  52. return reference_(*this);
  53. }
  54. public:
  55. // Qi interface: metafunction calculating parser attribute type
  56. template <typename Context, typename Iterator>
  57. struct attribute
  58. {
  59. // the return value of a token set contains the matched token
  60. // id, and the corresponding pair of iterators
  61. typedef typename Iterator::base_iterator_type iterator_type;
  62. typedef
  63. fusion::vector2<id_type, iterator_range<iterator_type> >
  64. type;
  65. };
  66. // Qi interface: parse functionality
  67. template <typename Iterator, typename Context
  68. , typename Skipper, typename Attribute>
  69. bool parse(Iterator& first, Iterator const& last
  70. , Context& /*context*/, Skipper const& skipper
  71. , Attribute& attr) const
  72. {
  73. qi::skip_over(first, last, skipper); // always do a pre-skip
  74. if (first != last) {
  75. typedef typename
  76. std::iterator_traits<Iterator>::value_type
  77. token_type;
  78. token_type const& t = *first;
  79. if (token_is_valid(t) && t.state() == first.get_state()) {
  80. // any of the token definitions matched
  81. spirit::traits::assign_to(t, attr);
  82. ++first;
  83. return true;
  84. }
  85. }
  86. return false;
  87. }
  88. // Qi interface: 'what' functionality
  89. template <typename Context>
  90. info what(Context& /*context*/) const
  91. {
  92. return info("lexer");
  93. }
  94. private:
  95. // allow to use the lexer.self.add("regex1", id1)("regex2", id2);
  96. // syntax
  97. struct adder
  98. {
  99. adder(lexer_def_& def_)
  100. : def(def_) {}
  101. // Add a token definition based on a single character as given
  102. // by the first parameter, the second parameter allows to
  103. // specify the token id to use for the new token. If no token
  104. // id is given the character code is used.
  105. adder const& operator()(char_type c
  106. , id_type token_id = id_type()) const
  107. {
  108. if (id_type() == token_id)
  109. token_id = static_cast<id_type>(c);
  110. def.def.add_token (def.state.c_str(), c, token_id
  111. , def.targetstate.empty() ? 0 : def.targetstate.c_str());
  112. return *this;
  113. }
  114. // Add a token definition based on a character sequence as
  115. // given by the first parameter, the second parameter allows to
  116. // specify the token id to use for the new token. If no token
  117. // id is given this function will generate a unique id to be
  118. // used as the token's id.
  119. adder const& operator()(string_type const& s
  120. , id_type token_id = id_type()) const
  121. {
  122. if (id_type() == token_id)
  123. token_id = def.def.get_next_id();
  124. def.def.add_token (def.state.c_str(), s, token_id
  125. , def.targetstate.empty() ? 0 : def.targetstate.c_str());
  126. return *this;
  127. }
  128. template <typename Attribute>
  129. adder const& operator()(
  130. token_def<Attribute, char_type, id_type>& tokdef
  131. , id_type token_id = id_type()) const
  132. {
  133. // make sure we have a token id
  134. if (id_type() == token_id) {
  135. if (id_type() == tokdef.id()) {
  136. token_id = def.def.get_next_id();
  137. tokdef.id(token_id);
  138. }
  139. else {
  140. token_id = tokdef.id();
  141. }
  142. }
  143. else {
  144. // the following assertion makes sure that the token_def
  145. // instance has not been assigned a different id earlier
  146. BOOST_ASSERT(id_type() == tokdef.id()
  147. || token_id == tokdef.id());
  148. tokdef.id(token_id);
  149. }
  150. def.define(tokdef);
  151. return *this;
  152. }
  153. // template <typename F>
  154. // adder const& operator()(char_type c, id_type token_id, F act) const
  155. // {
  156. // if (id_type() == token_id)
  157. // token_id = def.def.get_next_id();
  158. // std::size_t unique_id =
  159. // def.def.add_token (def.state.c_str(), s, token_id);
  160. // def.def.add_action(unique_id, def.state.c_str(), act);
  161. // return *this;
  162. // }
  163. lexer_def_& def;
  164. // silence MSVC warning C4512: assignment operator could not be generated
  165. BOOST_DELETED_FUNCTION(adder& operator= (adder const&))
  166. };
  167. friend struct adder;
  168. // allow to use lexer.self.add_pattern("pattern1", "regex1")(...);
  169. // syntax
  170. struct pattern_adder
  171. {
  172. pattern_adder(lexer_def_& def_)
  173. : def(def_) {}
  174. pattern_adder const& operator()(string_type const& p
  175. , string_type const& s) const
  176. {
  177. def.def.add_pattern (def.state.c_str(), p, s);
  178. return *this;
  179. }
  180. lexer_def_& def;
  181. // silence MSVC warning C4512: assignment operator could not be generated
  182. BOOST_DELETED_FUNCTION(pattern_adder& operator= (pattern_adder const&))
  183. };
  184. friend struct pattern_adder;
  185. private:
  186. // Helper function to invoke the necessary 2 step compilation
  187. // process on token definition expressions
  188. template <typename TokenExpr>
  189. void compile2pass(TokenExpr const& expr)
  190. {
  191. expr.collect(def, state, targetstate);
  192. expr.add_actions(def);
  193. }
  194. public:
  195. ///////////////////////////////////////////////////////////////////
  196. template <typename Expr>
  197. void define(Expr const& expr)
  198. {
  199. compile2pass(compile<lex::domain>(expr));
  200. }
  201. lexer_def_(LexerDef& def_, string_type const& state_
  202. , string_type const& targetstate_ = string_type())
  203. : proto_base_type(terminal_type::make(alias()))
  204. , add(this_()), add_pattern(this_()), def(def_)
  205. , state(state_), targetstate(targetstate_)
  206. {}
  207. // allow to switch states
  208. lexer_def_ operator()(char_type const* state) const
  209. {
  210. return lexer_def_(def, state);
  211. }
  212. lexer_def_ operator()(char_type const* state
  213. , char_type const* targetstate) const
  214. {
  215. return lexer_def_(def, state, targetstate);
  216. }
  217. lexer_def_ operator()(string_type const& state
  218. , string_type const& targetstate = string_type()) const
  219. {
  220. return lexer_def_(def, state, targetstate);
  221. }
  222. // allow to assign a token definition expression
  223. template <typename Expr>
  224. lexer_def_& operator= (Expr const& xpr)
  225. {
  226. // Report invalid expression error as early as possible.
  227. // If you got an error_invalid_expression error message here,
  228. // then the expression (expr) is not a valid spirit lex
  229. // expression.
  230. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  231. def.clear(state.c_str());
  232. define(xpr);
  233. return *this;
  234. }
  235. // explicitly tell the lexer that the given state will be defined
  236. // (useful in conjunction with "*")
  237. std::size_t add_state(char_type const* state = 0)
  238. {
  239. return def.add_state(state ? state : def.initial_state().c_str());
  240. }
  241. adder add;
  242. pattern_adder add_pattern;
  243. private:
  244. LexerDef& def;
  245. string_type state;
  246. string_type targetstate;
  247. // silence MSVC warning C4512: assignment operator could not be generated
  248. BOOST_DELETED_FUNCTION(lexer_def_& operator= (lexer_def_ const&))
  249. };
  250. #if defined(BOOST_NO_CXX11_RVALUE_REFERENCES)
  251. // allow to assign a token definition expression
  252. template <typename LexerDef, typename Expr>
  253. inline lexer_def_<LexerDef>&
  254. operator+= (lexer_def_<LexerDef>& lexdef, Expr& xpr)
  255. {
  256. // Report invalid expression error as early as possible.
  257. // If you got an error_invalid_expression error message here,
  258. // then the expression (expr) is not a valid spirit lex
  259. // expression.
  260. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  261. lexdef.define(xpr);
  262. return lexdef;
  263. }
  264. #else
  265. // allow to assign a token definition expression
  266. template <typename LexerDef, typename Expr>
  267. inline lexer_def_<LexerDef>&
  268. operator+= (lexer_def_<LexerDef>& lexdef, Expr&& xpr)
  269. {
  270. // Report invalid expression error as early as possible.
  271. // If you got an error_invalid_expression error message here,
  272. // then the expression (expr) is not a valid spirit lex
  273. // expression.
  274. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  275. lexdef.define(xpr);
  276. return lexdef;
  277. }
  278. #endif
  279. template <typename LexerDef, typename Expr>
  280. inline lexer_def_<LexerDef>&
  281. operator+= (lexer_def_<LexerDef>& lexdef, Expr const& xpr)
  282. {
  283. // Report invalid expression error as early as possible.
  284. // If you got an error_invalid_expression error message here,
  285. // then the expression (expr) is not a valid spirit lex
  286. // expression.
  287. BOOST_SPIRIT_ASSERT_MATCH(lex::domain, Expr);
  288. lexdef.define(xpr);
  289. return lexdef;
  290. }
  291. }
  292. ///////////////////////////////////////////////////////////////////////////
  // match_flags groups the flag values which select the matching mode of
  // the lexer. The enumeration is wrapped in a struct so that the values
  // are spelled match_flags::match_xxx.
  struct match_flags
  {
      enum enum_type
      {
          // no flags
          match_default = 0,
          // the regex '.' doesn't match newlines
          match_not_dot_newline = 1 << 0,
          // all matching operations are case insensitive
          match_icase = 1 << 1
      };
  };
  304. ///////////////////////////////////////////////////////////////////////////
  305. // This represents a lexer object
  306. ///////////////////////////////////////////////////////////////////////////
  307. ///////////////////////////////////////////////////////////////////////////
  // First token id handed out by the library when it has to assign an
  // id automatically.
  enum tokenids
  {
      min_token_id = 1 << 16   // == 0x10000
  };
  314. template <typename Lexer>
  315. class lexer : public Lexer
  316. {
  317. private:
  318. // avoid warnings about using 'this' in constructor
  319. lexer& this_() { return *this; }
  320. std::size_t next_token_id; // has to be an integral type
  321. public:
  322. typedef Lexer lexer_type;
  323. typedef typename Lexer::id_type id_type;
  324. typedef typename Lexer::char_type char_type;
  325. typedef typename Lexer::iterator_type iterator_type;
  326. typedef lexer base_type;
  327. typedef detail::lexer_def_<lexer> lexer_def;
  328. typedef std::basic_string<char_type> string_type;
  329. // if `id_type` was specified but `first_id` is not provided
  330. // the `min_token_id` value may be out of range for `id_type`,
  331. // but it will be a problem only if unique ids feature is in use.
  332. lexer(unsigned int flags = match_flags::match_default)
  333. : lexer_type(flags)
  334. , next_token_id(min_token_id)
  335. , self(this_(), lexer_type::initial_state())
  336. {}
  337. lexer(unsigned int flags, id_type first_id)
  338. : lexer_type(flags)
  339. , next_token_id(first_id)
  340. , self(this_(), lexer_type::initial_state())
  341. {}
  342. // access iterator interface
  343. template <typename Iterator>
  344. iterator_type begin(Iterator& first, Iterator const& last
  345. , char_type const* initial_state = 0) const
  346. { return this->lexer_type::begin(first, last, initial_state); }
  347. iterator_type end() const
  348. { return this->lexer_type::end(); }
  349. std::size_t map_state(char_type const* state)
  350. { return this->lexer_type::add_state(state); }
  351. // create a unique token id
  352. id_type get_next_id() { return id_type(next_token_id++); }
  353. lexer_def self; // allow for easy token definition
  354. };
  355. }}}
  356. #endif