query.hpp 9.6 KB


  1. /*=============================================================================
  2. Copyright (c) 2001-2011 Joel de Guzman
  3. Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. Autogenerated by MultiStageTable.py (Unicode multi-stage
  6. table builder) (c) Peter Kankowski, 2008
  7. ==============================================================================*/
  8. #if !defined(BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010)
  9. #define BOOST_SPIRIT_UNICODE_QUERY_FEBRUARY_2_2010
  10. #include <boost/cstdint.hpp>
  11. # include "category_table.hpp"
  12. # include "script_table.hpp"
  13. # include "lowercase_table.hpp"
  14. # include "uppercase_table.hpp"
  15. namespace boost { namespace spirit { namespace ucd
  16. {
  17. // This header provides Basic (Level 1) Unicode Support
  18. // See http://unicode.org/reports/tr18/ for details
  // Constants describing the Unicode properties exposed by this header:
  // general categories, derived boolean properties and scripts.
  struct properties
  {
      // A general category value is packed as xxMMMCCC:
      //   MMM: major_category
      //   CCC: position of the category within its major category
      enum major_category
      {
          letter,
          mark,
          number,
          separator,
          other,
          punctuation,
          symbol
      };

      // Each group starts at (major_category << 3); the remaining members of
      // the group follow consecutively.
      enum category
      {
          // Letters
          uppercase_letter = letter << 3,   // [Lu] an uppercase letter
          lowercase_letter,                 // [Ll] a lowercase letter
          titlecase_letter,                 // [Lt] a digraphic character, with first part uppercase
          modifier_letter,                  // [Lm] a modifier letter
          other_letter,                     // [Lo] other letters, including syllables and ideographs

          // Marks
          nonspacing_mark = mark << 3,      // [Mn] a nonspacing combining mark (zero advance width)
          enclosing_mark,                   // [Me] an enclosing combining mark
          spacing_mark,                     // [Mc] a spacing combining mark (positive advance width)

          // Numbers
          decimal_number = number << 3,     // [Nd] a decimal digit
          letter_number,                    // [Nl] a letterlike numeric character
          other_number,                     // [No] a numeric character of other type

          // Separators
          space_separator = separator << 3, // [Zs] a space character (of various non-zero widths)
          line_separator,                   // [Zl] U+2028 LINE SEPARATOR only
          paragraph_separator,              // [Zp] U+2029 PARAGRAPH SEPARATOR only

          // Other
          control = other << 3,             // [Cc] a C0 or C1 control code
          format,                           // [Cf] a format control character
          private_use,                      // [Co] a private-use character
          surrogate,                        // [Cs] a surrogate code point
          unassigned,                       // [Cn] a reserved unassigned code point or a noncharacter

          // Punctuation
          dash_punctuation = punctuation << 3, // [Pd] a dash or hyphen punctuation mark
          open_punctuation,                 // [Ps] an opening punctuation mark (of a pair)
          close_punctuation,                // [Pe] a closing punctuation mark (of a pair)
          connector_punctuation,            // [Pc] a connecting punctuation mark, like a tie
          other_punctuation,                // [Po] a punctuation mark of other type
          initial_punctuation,              // [Pi] an initial quotation mark
          final_punctuation,                // [Pf] a final quotation mark

          // Symbols
          math_symbol = symbol << 3,        // [Sm] a symbol of primarily mathematical use
          currency_symbol,                  // [Sc] a currency sign
          modifier_symbol,                  // [Sk] a non-letterlike modifier symbol
          other_symbol                      // [So] a symbol of other type
      };

      // Single-bit flags located above the six category bits.
      enum derived_properties
      {
          alphabetic                   = 1 << 6,
          uppercase                    = 1 << 7,
          lowercase                    = 1 << 8,
          white_space                  = 1 << 9,
          hex_digit                    = 1 << 10,
          noncharacter_code_point      = 1 << 11,
          default_ignorable_code_point = 1 << 12
      };

      // Script identifiers; the numeric values are fixed explicitly.
      enum script
      {
          arabic                 = 0,
          imperial_aramaic       = 1,
          armenian               = 2,
          avestan                = 3,
          balinese               = 4,
          bamum                  = 5,
          bengali                = 6,
          bopomofo               = 7,
          braille                = 8,
          buginese               = 9,
          buhid                  = 10,
          canadian_aboriginal    = 11,
          carian                 = 12,
          cham                   = 13,
          cherokee               = 14,
          coptic                 = 15,
          cypriot                = 16,
          cyrillic               = 17,
          devanagari             = 18,
          deseret                = 19,
          egyptian_hieroglyphs   = 20,
          ethiopic               = 21,
          georgian               = 22,
          glagolitic             = 23,
          gothic                 = 24,
          greek                  = 25,
          gujarati               = 26,
          gurmukhi               = 27,
          hangul                 = 28,
          han                    = 29,
          hanunoo                = 30,
          hebrew                 = 31,
          hiragana               = 32,
          katakana_or_hiragana   = 33,
          old_italic             = 34,
          javanese               = 35,
          kayah_li               = 36,
          katakana               = 37,
          kharoshthi             = 38,
          khmer                  = 39,
          kannada                = 40,
          kaithi                 = 41,
          tai_tham               = 42,
          lao                    = 43,
          latin                  = 44,
          lepcha                 = 45,
          limbu                  = 46,
          linear_b               = 47,
          lisu                   = 48,
          lycian                 = 49,
          lydian                 = 50,
          malayalam              = 51,
          mongolian              = 52,
          meetei_mayek           = 53,
          myanmar                = 54,
          nko                    = 55,
          ogham                  = 56,
          ol_chiki               = 57,
          old_turkic             = 58,
          oriya                  = 59,
          osmanya                = 60,
          phags_pa               = 61,
          inscriptional_pahlavi  = 62,
          phoenician             = 63,
          inscriptional_parthian = 64,
          rejang                 = 65,
          runic                  = 66,
          samaritan              = 67,
          old_south_arabian      = 68,
          saurashtra             = 69,
          shavian                = 70,
          sinhala                = 71,
          sundanese              = 72,
          syloti_nagri           = 73,
          syriac                 = 74,
          tagbanwa               = 75,
          tai_le                 = 76,
          new_tai_lue            = 77,
          tamil                  = 78,
          tai_viet               = 79,
          telugu                 = 80,
          tifinagh               = 81,
          tagalog                = 82,
          thaana                 = 83,
          thai                   = 84,
          tibetan                = 85,
          ugaritic               = 86,
          vai                    = 87,
          old_persian            = 88,
          cuneiform              = 89,
          yi                     = 90,
          inherited              = 91,
          common                 = 92,
          unknown                = 93
      };
  };
  175. inline properties::category get_category(::boost::uint32_t ch)
  176. {
  177. return static_cast<properties::category>(detail::category_lookup(ch) & 0x3F);
  178. }
  179. inline properties::major_category get_major_category(::boost::uint32_t ch)
  180. {
  181. return static_cast<properties::major_category>(get_category(ch) >> 3);
  182. }
  183. inline bool is_punctuation(::boost::uint32_t ch)
  184. {
  185. return get_major_category(ch) == properties::punctuation;
  186. }
  187. inline bool is_decimal_number(::boost::uint32_t ch)
  188. {
  189. return get_category(ch) == properties::decimal_number;
  190. }
  191. inline bool is_hex_digit(::boost::uint32_t ch)
  192. {
  193. return (detail::category_lookup(ch) & properties::hex_digit) != 0;
  194. }
  195. inline bool is_control(::boost::uint32_t ch)
  196. {
  197. return get_category(ch) == properties::control;
  198. }
  199. inline bool is_alphabetic(::boost::uint32_t ch)
  200. {
  201. return (detail::category_lookup(ch) & properties::alphabetic) != 0;
  202. }
  203. inline bool is_alphanumeric(::boost::uint32_t ch)
  204. {
  205. return is_decimal_number(ch) || is_alphabetic(ch);
  206. }
  207. inline bool is_uppercase(::boost::uint32_t ch)
  208. {
  209. return (detail::category_lookup(ch) & properties::uppercase) != 0;
  210. }
  211. inline bool is_lowercase(::boost::uint32_t ch)
  212. {
  213. return (detail::category_lookup(ch) & properties::lowercase) != 0;
  214. }
  215. inline bool is_white_space(::boost::uint32_t ch)
  216. {
  217. return (detail::category_lookup(ch) & properties::white_space) != 0;
  218. }
  219. inline bool is_blank(::boost::uint32_t ch)
  220. {
  221. switch (ch)
  222. {
  223. case '\n': case '\v': case '\f': case '\r':
  224. return false;
  225. default:
  226. return is_white_space(ch)
  227. && !( get_category(ch) == properties::line_separator
  228. || get_category(ch) == properties::paragraph_separator
  229. );
  230. }
  231. }
  232. inline bool is_graph(::boost::uint32_t ch)
  233. {
  234. return !( is_white_space(ch)
  235. || get_category(ch) == properties::control
  236. || get_category(ch) == properties::surrogate
  237. || get_category(ch) == properties::unassigned
  238. );
  239. }
  240. inline bool is_print(::boost::uint32_t ch)
  241. {
  242. return (is_graph(ch) || is_blank(ch)) && !is_control(ch);
  243. }
  244. inline bool is_noncharacter_code_point(::boost::uint32_t ch)
  245. {
  246. return (detail::category_lookup(ch) & properties::noncharacter_code_point) != 0;
  247. }
  248. inline bool is_default_ignorable_code_point(::boost::uint32_t ch)
  249. {
  250. return (detail::category_lookup(ch) & properties::default_ignorable_code_point) != 0;
  251. }
  252. inline properties::script get_script(::boost::uint32_t ch)
  253. {
  254. return static_cast<properties::script>(detail::script_lookup(ch) & 0x7F);
  255. }
  256. inline ::boost::uint32_t to_lowercase(::boost::uint32_t ch)
  257. {
  258. // The table returns 0 to signal that this code maps to itself
  259. ::boost::uint32_t r = detail::lowercase_lookup(ch);
  260. return (r == 0)? ch : r;
  261. }
  262. inline ::boost::uint32_t to_uppercase(::boost::uint32_t ch)
  263. {
  264. // The table returns 0 to signal that this code maps to itself
  265. ::boost::uint32_t r = detail::uppercase_lookup(ch);
  266. return (r == 0)? ch : r;
  267. }
  268. }}}
  269. #endif