parse_charset.hpp 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368
  1. ///////////////////////////////////////////////////////////////////////////////
  2. // parse_charset.hpp
  3. //
  4. // Copyright 2008 Eric Niebler. Distributed under the Boost
  5. // Software License, Version 1.0. (See accompanying file
  6. // LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  7. #ifndef BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
  8. #define BOOST_XPRESSIVE_DETAIL_DYNAMIC_PARSE_CHARSET_HPP_EAN_10_04_2005
  9. // MS compatible compilers support #pragma once
  10. #if defined(_MSC_VER)
  11. # pragma once
  12. #endif
  13. #include <boost/config.hpp>
  14. #include <boost/integer.hpp>
  15. #include <boost/mpl/bool.hpp>
  16. #include <boost/throw_exception.hpp>
  17. #include <boost/numeric/conversion/converter.hpp>
  18. #include <boost/xpressive/detail/detail_fwd.hpp>
  19. #include <boost/xpressive/detail/dynamic/parser_enum.hpp>
  20. #include <boost/xpressive/detail/utility/literals.hpp>
  21. #include <boost/xpressive/detail/utility/chset/chset.hpp>
  22. #include <boost/xpressive/regex_constants.hpp>
  23. namespace boost { namespace xpressive { namespace detail
  24. {
  /// Discriminator stored in escape_value::type_, telling the caller which
  /// member of the escape_value carries the result of parsing an escape.
  /// The enumerator order is significant: escape_char must remain the first
  /// (zero) value.
  enum escape_type
  {
      escape_char     ///< the escape denotes one character (escape_value::ch_)
    , escape_mark     ///< not produced in this file; presumably refers to escape_value::mark_nbr_ -- confirm against callers
    , escape_class    ///< the escape denotes a character class (escape_value::class_)
  };
  31. ///////////////////////////////////////////////////////////////////////////////
  32. // escape_value
  33. //
  34. template<typename Char, typename Class>
  35. struct escape_value
  36. {
  37. Char ch_;
  38. int mark_nbr_;
  39. Class class_;
  40. escape_type type_;
  41. };
  42. ///////////////////////////////////////////////////////////////////////////////
  43. // char_overflow_handler
  44. //
  45. struct char_overflow_handler
  46. {
  47. void operator ()(numeric::range_check_result result) const // throw(regex_error)
  48. {
  49. if(numeric::cInRange != result)
  50. {
  51. BOOST_THROW_EXCEPTION(
  52. regex_error(
  53. regex_constants::error_escape
  54. , "character escape too large to fit in target character type"
  55. )
  56. );
  57. }
  58. }
  59. };
  60. ///////////////////////////////////////////////////////////////////////////////
  61. // parse_escape
  62. //
  63. template<typename FwdIter, typename CompilerTraits>
  64. escape_value<typename iterator_value<FwdIter>::type, typename CompilerTraits::regex_traits::char_class_type>
  65. parse_escape(FwdIter &begin, FwdIter end, CompilerTraits &tr)
  66. {
  67. using namespace regex_constants;
  68. typedef typename iterator_value<FwdIter>::type char_type;
  69. typedef typename CompilerTraits::regex_traits regex_traits;
  70. typedef typename regex_traits::char_class_type char_class_type;
  71. // define an unsigned type the same size as char_type
  72. typedef typename boost::uint_t<CHAR_BIT * sizeof(char_type)>::least uchar_t;
  73. BOOST_MPL_ASSERT_RELATION(sizeof(uchar_t), ==, sizeof(char_type));
  74. typedef numeric::conversion_traits<uchar_t, int> converstion_traits;
  75. BOOST_XPR_ENSURE_(begin != end, error_escape, "unexpected end of pattern found");
  76. numeric::converter<int, uchar_t, converstion_traits, char_overflow_handler> converter;
  77. escape_value<char_type,char_class_type> esc = { 0, 0, 0, escape_char };
  78. bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
  79. regex_traits const &rxtraits = tr.traits();
  80. FwdIter tmp;
  81. esc.class_ = rxtraits.lookup_classname(begin, begin + 1, icase);
  82. if(0 != esc.class_)
  83. {
  84. esc.type_ = escape_class;
  85. return esc;
  86. }
  87. if(-1 != rxtraits.value(*begin, 8))
  88. {
  89. esc.ch_ = converter(toi(begin, end, rxtraits, 8, 0777));
  90. return esc;
  91. }
  92. switch(*begin)
  93. {
  94. // bell character
  95. case BOOST_XPR_CHAR_(char_type, 'a'):
  96. esc.ch_ = BOOST_XPR_CHAR_(char_type, '\a');
  97. ++begin;
  98. break;
  99. // escape character
  100. case BOOST_XPR_CHAR_(char_type, 'e'):
  101. esc.ch_ = converter(27);
  102. ++begin;
  103. break;
  104. // control character
  105. case BOOST_XPR_CHAR_(char_type, 'c'):
  106. BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
  107. BOOST_XPR_ENSURE_
  108. (
  109. rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'a'), BOOST_XPR_CHAR_(char_type, 'z'), *begin)
  110. || rxtraits.in_range(BOOST_XPR_CHAR_(char_type, 'A'), BOOST_XPR_CHAR_(char_type, 'Z'), *begin)
  111. , error_escape
  112. , "invalid escape control letter; must be one of a-z or A-Z"
  113. );
  114. // Convert to character according to ECMA-262, section 15.10.2.10:
  115. esc.ch_ = converter(*begin % 32);
  116. ++begin;
  117. break;
  118. // formfeed character
  119. case BOOST_XPR_CHAR_(char_type, 'f'):
  120. esc.ch_ = BOOST_XPR_CHAR_(char_type, '\f');
  121. ++begin;
  122. break;
  123. // newline
  124. case BOOST_XPR_CHAR_(char_type, 'n'):
  125. esc.ch_ = BOOST_XPR_CHAR_(char_type, '\n');
  126. ++begin;
  127. break;
  128. // return
  129. case BOOST_XPR_CHAR_(char_type, 'r'):
  130. esc.ch_ = BOOST_XPR_CHAR_(char_type, '\r');
  131. ++begin;
  132. break;
  133. // horizontal tab
  134. case BOOST_XPR_CHAR_(char_type, 't'):
  135. esc.ch_ = BOOST_XPR_CHAR_(char_type, '\t');
  136. ++begin;
  137. break;
  138. // vertical tab
  139. case BOOST_XPR_CHAR_(char_type, 'v'):
  140. esc.ch_ = BOOST_XPR_CHAR_(char_type, '\v');
  141. ++begin;
  142. break;
  143. // hex escape sequence
  144. case BOOST_XPR_CHAR_(char_type, 'x'):
  145. BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
  146. tmp = begin;
  147. esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xff));
  148. BOOST_XPR_ENSURE_(2 == std::distance(tmp, begin), error_escape, "invalid hex escape : "
  149. "must be \\x HexDigit HexDigit");
  150. break;
  151. // Unicode escape sequence
  152. case BOOST_XPR_CHAR_(char_type, 'u'):
  153. BOOST_XPR_ENSURE_(++begin != end, error_escape, "unexpected end of pattern found");
  154. tmp = begin;
  155. esc.ch_ = converter(toi(begin, end, rxtraits, 16, 0xffff));
  156. BOOST_XPR_ENSURE_(4 == std::distance(tmp, begin), error_escape, "invalid Unicode escape : "
  157. "must be \\u HexDigit HexDigit HexDigit HexDigit");
  158. break;
  159. // backslash
  160. case BOOST_XPR_CHAR_(char_type, '\\'):
  161. //esc.ch_ = BOOST_XPR_CHAR_(char_type, '\\');
  162. //++begin;
  163. //break;
  164. // all other escaped characters represent themselves
  165. default:
  166. esc.ch_ = *begin;
  167. ++begin;
  168. break;
  169. }
  170. return esc;
  171. }
  172. //////////////////////////////////////////////////////////////////////////
  173. // parse_charset
  174. //
  175. template<typename FwdIter, typename RegexTraits, typename CompilerTraits>
  176. inline void parse_charset
  177. (
  178. FwdIter &begin
  179. , FwdIter end
  180. , compound_charset<RegexTraits> &chset
  181. , CompilerTraits &tr
  182. )
  183. {
  184. using namespace regex_constants;
  185. typedef typename RegexTraits::char_type char_type;
  186. typedef typename RegexTraits::char_class_type char_class_type;
  187. BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
  188. RegexTraits const &rxtraits = tr.traits();
  189. bool const icase = (0 != (regex_constants::icase_ & tr.flags()));
  190. FwdIter iprev = FwdIter();
  191. escape_value<char_type, char_class_type> esc = {0, 0, 0, escape_char};
  192. bool invert = false;
  193. // check to see if we have an inverse charset
  194. if(begin != end && token_charset_invert == tr.get_charset_token(iprev = begin, end))
  195. {
  196. begin = iprev;
  197. invert = true;
  198. }
  199. // skip the end token if-and-only-if it is the first token in the charset
  200. if(begin != end && token_charset_end == tr.get_charset_token(iprev = begin, end))
  201. {
  202. for(; begin != iprev; ++begin)
  203. {
  204. chset.set_char(*begin, rxtraits, icase);
  205. }
  206. }
  207. compiler_token_type tok;
  208. char_type ch_prev = char_type(), ch_next = char_type();
  209. bool have_prev = false;
  210. BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
  211. // remember the current position and grab the next token
  212. iprev = begin;
  213. tok = tr.get_charset_token(begin, end);
  214. do
  215. {
  216. BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
  217. if(token_charset_hyphen == tok && have_prev)
  218. {
  219. // remember the current position
  220. FwdIter iprev2 = begin;
  221. have_prev = false;
  222. // ch_prev is lower bound of a range
  223. switch(tr.get_charset_token(begin, end))
  224. {
  225. case token_charset_hyphen:
  226. case token_charset_invert:
  227. begin = iprev2; // un-get these tokens and fall through
  228. BOOST_FALLTHROUGH;
  229. case token_literal:
  230. ch_next = *begin++;
  231. BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
  232. chset.set_range(ch_prev, ch_next, rxtraits, icase);
  233. continue;
  234. case token_charset_backspace:
  235. ch_next = char_type(8); // backspace
  236. BOOST_XPR_ENSURE_(ch_prev <= ch_next, error_range, "invalid charset range");
  237. chset.set_range(ch_prev, ch_next, rxtraits, icase);
  238. continue;
  239. case token_escape:
  240. esc = parse_escape(begin, end, tr);
  241. if(escape_char == esc.type_)
  242. {
  243. BOOST_XPR_ENSURE_(ch_prev <= esc.ch_, error_range, "invalid charset range");
  244. chset.set_range(ch_prev, esc.ch_, rxtraits, icase);
  245. continue;
  246. }
  247. BOOST_FALLTHROUGH;
  248. case token_charset_end:
  249. default: // not a range.
  250. begin = iprev; // backup to hyphen token
  251. chset.set_char(ch_prev, rxtraits, icase);
  252. chset.set_char(*begin++, rxtraits, icase);
  253. continue;
  254. }
  255. }
  256. if(have_prev)
  257. {
  258. chset.set_char(ch_prev, rxtraits, icase);
  259. have_prev = false;
  260. }
  261. switch(tok)
  262. {
  263. case token_charset_hyphen:
  264. case token_charset_invert:
  265. case token_charset_end:
  266. case token_posix_charset_end:
  267. begin = iprev; // un-get these tokens
  268. ch_prev = *begin++;
  269. have_prev = true;
  270. continue;
  271. case token_charset_backspace:
  272. ch_prev = char_type(8); // backspace
  273. have_prev = true;
  274. continue;
  275. case token_posix_charset_begin:
  276. {
  277. FwdIter tmp = begin, start = begin;
  278. bool invert = (token_charset_invert == tr.get_charset_token(tmp, end));
  279. if(invert)
  280. {
  281. begin = start = tmp;
  282. }
  283. while(token_literal == (tok = tr.get_charset_token(begin, end)))
  284. {
  285. tmp = ++begin;
  286. BOOST_XPR_ENSURE_(begin != end, error_brack, "unexpected end of pattern found");
  287. }
  288. if(token_posix_charset_end == tok)
  289. {
  290. char_class_type chclass = rxtraits.lookup_classname(start, tmp, icase);
  291. BOOST_XPR_ENSURE_(0 != chclass, error_ctype, "unknown class name");
  292. chset.set_class(chclass, invert);
  293. continue;
  294. }
  295. begin = iprev; // un-get this token
  296. ch_prev = *begin++;
  297. have_prev = true;
  298. }
  299. continue;
  300. case token_escape:
  301. esc = parse_escape(begin, end, tr);
  302. if(escape_char == esc.type_)
  303. {
  304. ch_prev = esc.ch_;
  305. have_prev = true;
  306. }
  307. else if(escape_class == esc.type_)
  308. {
  309. char_class_type upper_ = lookup_classname(rxtraits, "upper");
  310. BOOST_ASSERT(0 != upper_);
  311. chset.set_class(esc.class_, rxtraits.isctype(*begin++, upper_));
  312. }
  313. else
  314. {
  315. BOOST_ASSERT(false);
  316. }
  317. continue;
  318. default:
  319. ch_prev = *begin++;
  320. have_prev = true;
  321. continue;
  322. }
  323. }
  324. while(BOOST_XPR_ENSURE_((iprev = begin) != end, error_brack, "unexpected end of pattern found"),
  325. token_charset_end != (tok = tr.get_charset_token(begin, end)));
  326. if(have_prev)
  327. {
  328. chset.set_char(ch_prev, rxtraits, icase);
  329. }
  330. if(invert)
  331. {
  332. chset.inverse();
  333. }
  334. }
  335. }}} // namespace boost::xpressive::detail
  336. #endif