string_token.hpp 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406
  1. // string_token.hpp
  2. // Copyright (c) 2007-2009 Ben Hanson (http://www.benhanson.net/)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file licence_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. #ifndef BOOST_LEXER_STRING_TOKEN_HPP
  7. #define BOOST_LEXER_STRING_TOKEN_HPP
  8. #include <algorithm>
  9. #include "size_t.hpp"
  10. #include "consts.hpp" // num_chars, num_wchar_ts
  11. #include <string>
  12. #include <limits>
  13. namespace boost
  14. {
  15. namespace lexer
  16. {
  17. template<typename CharT>
  18. struct basic_string_token
  19. {
  20. typedef std::basic_string<CharT> string;
  21. bool _negated;
  22. string _charset;
  23. basic_string_token () :
  24. _negated (false)
  25. {
  26. }
  27. basic_string_token (const bool negated_, const string &charset_) :
  28. _negated (negated_),
  29. _charset (charset_)
  30. {
  31. }
  32. void remove_duplicates ()
  33. {
  34. const CharT *start_ = _charset.c_str ();
  35. const CharT *end_ = start_ + _charset.size ();
  36. // Optimisation for very large charsets:
  37. // sorting via pointers is much quicker than
  38. // via iterators...
  39. std::sort (const_cast<CharT *> (start_), const_cast<CharT *> (end_));
  40. _charset.erase (std::unique (_charset.begin (), _charset.end ()),
  41. _charset.end ());
  42. }
  43. void normalise ()
  44. {
  45. const std::size_t max_chars_ = sizeof (CharT) == 1 ?
  46. num_chars : num_wchar_ts;
  47. if (_charset.length () == max_chars_)
  48. {
  49. _negated = !_negated;
  50. _charset.clear ();
  51. }
  52. else if (_charset.length () > max_chars_ / 2)
  53. {
  54. negate ();
  55. }
  56. }
  57. void negate ()
  58. {
  59. const std::size_t max_chars_ = sizeof (CharT) == 1 ?
  60. num_chars : num_wchar_ts;
  61. CharT curr_char_ = (std::numeric_limits<CharT>::min)();
  62. string temp_;
  63. const CharT *curr_ = _charset.c_str ();
  64. const CharT *chars_end_ = curr_ + _charset.size ();
  65. _negated = !_negated;
  66. temp_.resize (max_chars_ - _charset.size ());
  67. CharT *ptr_ = const_cast<CharT *> (temp_.c_str ());
  68. std::size_t i_ = 0;
  69. while (curr_ < chars_end_)
  70. {
  71. while (*curr_ > curr_char_)
  72. {
  73. *ptr_ = curr_char_;
  74. ++ptr_;
  75. ++curr_char_;
  76. ++i_;
  77. }
  78. ++curr_char_;
  79. ++curr_;
  80. ++i_;
  81. }
  82. for (; i_ < max_chars_; ++i_)
  83. {
  84. *ptr_ = curr_char_;
  85. ++ptr_;
  86. ++curr_char_;
  87. }
  88. _charset = temp_;
  89. }
  90. bool operator < (const basic_string_token &rhs_) const
  91. {
  92. return _negated < rhs_._negated ||
  93. (_negated == rhs_._negated && _charset < rhs_._charset);
  94. }
  95. bool empty () const
  96. {
  97. return _charset.empty () && !_negated;
  98. }
  99. bool any () const
  100. {
  101. return _charset.empty () && _negated;
  102. }
  103. void clear ()
  104. {
  105. _negated = false;
  106. _charset.clear ();
  107. }
  108. void intersect (basic_string_token &rhs_, basic_string_token &overlap_)
  109. {
  110. if ((any () && rhs_.any ()) || (_negated == rhs_._negated &&
  111. !any () && !rhs_.any ()))
  112. {
  113. intersect_same_types (rhs_, overlap_);
  114. }
  115. else
  116. {
  117. intersect_diff_types (rhs_, overlap_);
  118. }
  119. }
  120. static void escape_char (const CharT ch_, string &out_)
  121. {
  122. switch (ch_)
  123. {
  124. case '\0':
  125. out_ += '\\';
  126. out_ += '0';
  127. break;
  128. case '\a':
  129. out_ += '\\';
  130. out_ += 'a';
  131. break;
  132. case '\b':
  133. out_ += '\\';
  134. out_ += 'b';
  135. break;
  136. case 27:
  137. out_ += '\\';
  138. out_ += 'x';
  139. out_ += '1';
  140. out_ += 'b';
  141. break;
  142. case '\f':
  143. out_ += '\\';
  144. out_ += 'f';
  145. break;
  146. case '\n':
  147. out_ += '\\';
  148. out_ += 'n';
  149. break;
  150. case '\r':
  151. out_ += '\\';
  152. out_ += 'r';
  153. break;
  154. case '\t':
  155. out_ += '\\';
  156. out_ += 't';
  157. break;
  158. case '\v':
  159. out_ += '\\';
  160. out_ += 'v';
  161. break;
  162. case '\\':
  163. out_ += '\\';
  164. out_ += '\\';
  165. break;
  166. case '"':
  167. out_ += '\\';
  168. out_ += '"';
  169. break;
  170. case '\'':
  171. out_ += '\\';
  172. out_ += '\'';
  173. break;
  174. default:
  175. {
  176. if (ch_ < 32 && ch_ >= 0)
  177. {
  178. std::basic_stringstream<CharT> ss_;
  179. out_ += '\\';
  180. out_ += 'x';
  181. ss_ << std::hex <<
  182. static_cast<std::size_t> (ch_);
  183. out_ += ss_.str ();
  184. }
  185. else
  186. {
  187. out_ += ch_;
  188. }
  189. break;
  190. }
  191. }
  192. }
  193. private:
  194. void intersect_same_types (basic_string_token &rhs_,
  195. basic_string_token &overlap_)
  196. {
  197. if (any ())
  198. {
  199. clear ();
  200. overlap_._negated = true;
  201. rhs_.clear ();
  202. }
  203. else
  204. {
  205. typename string::iterator iter_ = _charset.begin ();
  206. typename string::iterator end_ = _charset.end ();
  207. typename string::iterator rhs_iter_ = rhs_._charset.begin ();
  208. typename string::iterator rhs_end_ = rhs_._charset.end ();
  209. overlap_._negated = _negated;
  210. while (iter_ != end_ && rhs_iter_ != rhs_end_)
  211. {
  212. if (*iter_ < *rhs_iter_)
  213. {
  214. ++iter_;
  215. }
  216. else if (*iter_ > *rhs_iter_)
  217. {
  218. ++rhs_iter_;
  219. }
  220. else
  221. {
  222. overlap_._charset += *iter_;
  223. iter_ = _charset.erase (iter_);
  224. end_ = _charset.end ();
  225. rhs_iter_ = rhs_._charset.erase (rhs_iter_);
  226. rhs_end_ = rhs_._charset.end ();
  227. }
  228. }
  229. if (_negated)
  230. {
  231. // duplicates already merged, so safe to merge
  232. // using std lib.
  233. // src, dest
  234. merge (_charset, overlap_._charset);
  235. // duplicates already merged, so safe to merge
  236. // using std lib.
  237. // src, dest
  238. merge (rhs_._charset, overlap_._charset);
  239. _negated = false;
  240. rhs_._negated = false;
  241. std::swap (_charset, rhs_._charset);
  242. normalise ();
  243. overlap_.normalise ();
  244. rhs_.normalise ();
  245. }
  246. else if (!overlap_._charset.empty ())
  247. {
  248. normalise ();
  249. overlap_.normalise ();
  250. rhs_.normalise ();
  251. }
  252. }
  253. }
  254. void intersect_diff_types (basic_string_token &rhs_,
  255. basic_string_token &overlap_)
  256. {
  257. if (any ())
  258. {
  259. intersect_any (rhs_, overlap_);
  260. }
  261. else if (_negated)
  262. {
  263. intersect_negated (rhs_, overlap_);
  264. }
  265. else // _negated == false
  266. {
  267. intersect_charset (rhs_, overlap_);
  268. }
  269. }
  270. void intersect_any (basic_string_token &rhs_, basic_string_token &overlap_)
  271. {
  272. if (rhs_._negated)
  273. {
  274. rhs_.intersect_negated (*this, overlap_);
  275. }
  276. else // rhs._negated == false
  277. {
  278. rhs_.intersect_charset (*this, overlap_);
  279. }
  280. }
  281. void intersect_negated (basic_string_token &rhs_,
  282. basic_string_token &overlap_)
  283. {
  284. if (rhs_.any ())
  285. {
  286. overlap_._negated = true;
  287. overlap_._charset = _charset;
  288. rhs_._negated = false;
  289. rhs_._charset = _charset;
  290. clear ();
  291. }
  292. else // rhs._negated == false
  293. {
  294. rhs_.intersect_charset (*this, overlap_);
  295. }
  296. }
  297. void intersect_charset (basic_string_token &rhs_,
  298. basic_string_token &overlap_)
  299. {
  300. if (rhs_.any ())
  301. {
  302. overlap_._charset = _charset;
  303. rhs_._negated = true;
  304. rhs_._charset = _charset;
  305. clear ();
  306. }
  307. else // rhs_._negated == true
  308. {
  309. typename string::iterator iter_ = _charset.begin ();
  310. typename string::iterator end_ = _charset.end ();
  311. typename string::iterator rhs_iter_ = rhs_._charset.begin ();
  312. typename string::iterator rhs_end_ = rhs_._charset.end ();
  313. while (iter_ != end_ && rhs_iter_ != rhs_end_)
  314. {
  315. if (*iter_ < *rhs_iter_)
  316. {
  317. overlap_._charset += *iter_;
  318. rhs_iter_ = rhs_._charset.insert (rhs_iter_, *iter_);
  319. ++rhs_iter_;
  320. rhs_end_ = rhs_._charset.end ();
  321. iter_ = _charset.erase (iter_);
  322. end_ = _charset.end ();
  323. }
  324. else if (*iter_ > *rhs_iter_)
  325. {
  326. ++rhs_iter_;
  327. }
  328. else
  329. {
  330. ++iter_;
  331. ++rhs_iter_;
  332. }
  333. }
  334. if (iter_ != end_)
  335. {
  336. // nothing bigger in rhs_ than iter_,
  337. // so safe to merge using std lib.
  338. string temp_ (iter_, end_);
  339. // src, dest
  340. merge (temp_, overlap_._charset);
  341. _charset.erase (iter_, end_);
  342. }
  343. if (!overlap_._charset.empty ())
  344. {
  345. merge (overlap_._charset, rhs_._charset);
  346. // possible duplicates, so check for any and erase.
  347. rhs_._charset.erase (std::unique (rhs_._charset.begin (),
  348. rhs_._charset.end ()), rhs_._charset.end ());
  349. normalise ();
  350. overlap_.normalise ();
  351. rhs_.normalise ();
  352. }
  353. }
  354. }
  355. void merge (string &src_, string &dest_)
  356. {
  357. string tmp_ (src_.size () + dest_.size (), 0);
  358. std::merge (src_.begin (), src_.end (), dest_.begin (), dest_.end (),
  359. tmp_.begin ());
  360. dest_ = tmp_;
  361. }
  362. };
  363. }
  364. }
  365. #endif