create_tables.cpp 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584
  1. /*=============================================================================
  2. Copyright (c) 2001-2011 Joel de Guzman
  3. Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. =============================================================================*/
  6. #include <boost/config/warning_disable.hpp>
  7. #include <boost/spirit/include/qi.hpp>
  8. #include <boost/spirit/include/phoenix.hpp>
  9. #include <boost/unordered_map.hpp>
  10. #include <boost/algorithm/string/trim.hpp>
  11. #include <boost/cstdint.hpp>
  12. #include <boost/foreach.hpp>
  13. #include <boost/array.hpp>
  14. #include <boost/scoped_array.hpp>
  15. #include <boost/range/iterator_range.hpp>
  16. #include <iostream>
  17. #include <iomanip>
  18. #include <fstream>
  19. #include <vector>
  20. #include <algorithm>
  21. #include <string>
  22. #include <map>
  23. // We place the data here. Each line comprises various fields
  24. typedef std::vector<std::string> ucd_line;
  25. typedef std::vector<ucd_line> ucd_vector;
  26. typedef std::vector<ucd_line>::iterator ucd_iterator;
  27. // spirit and phoenix using declarations
  28. using boost::spirit::qi::parse;
  29. using boost::spirit::qi::hex;
  30. using boost::spirit::qi::char_;
  31. using boost::spirit::qi::eol;
  32. using boost::spirit::qi::rule;
  33. using boost::spirit::qi::omit;
  34. using boost::spirit::qi::_1;
  35. using boost::spirit::qi::_val;
  36. using boost::phoenix::push_back;
  37. using boost::phoenix::ref;
  38. // basic unsigned types
  39. using boost::uint8_t;
  40. using boost::uint16_t;
  41. using boost::uint32_t;
  42. // a char range
  43. struct ucd_range
  44. {
  45. ucd_range(uint32_t start, uint32_t finish)
  46. : start(start), finish(finish) {}
  47. // we need this so we can use ucd_range as a multimap key
  48. friend bool operator<(ucd_range const& a, ucd_range const& b)
  49. {
  50. return a.start < b.start;
  51. }
  52. uint32_t start;
  53. uint32_t finish;
  54. };
  55. class ucd_info
  56. {
  57. public:
  58. ucd_info(char const* filename)
  59. {
  60. std::ifstream in(filename, std::ios_base::in);
  61. if (!in)
  62. {
  63. std::cerr << "Error: Could not open input file: "
  64. << filename << std::endl;
  65. }
  66. else
  67. {
  68. std::string data; // We will read the contents here.
  69. in.unsetf(std::ios::skipws); // No white space skipping!
  70. std::copy(
  71. std::istream_iterator<char>(in),
  72. std::istream_iterator<char>(),
  73. std::back_inserter(data));
  74. typedef std::string::const_iterator iterator_type;
  75. iterator_type f = data.begin();
  76. iterator_type l = data.end();
  77. rule<iterator_type> endl = -('#' >> *(char_-eol)) >> eol;
  78. rule<iterator_type, std::string()> field = *(char_-(';'|endl)) >> (';'|&endl);
  79. rule<iterator_type, ucd_line()> line = +(field-endl) >> endl;
  80. rule<iterator_type, std::vector<ucd_line>()> file = +(endl | line[push_back(_val, _1)]);
  81. parse(f, l, file, info);
  82. }
  83. }
  84. template <typename Array>
  85. void collect(Array& data, int field, bool collect_properties = true) const
  86. {
  87. BOOST_ASSERT(!info.empty());
  88. ucd_vector::const_iterator current = info.begin();
  89. ucd_vector::const_iterator end = info.end();
  90. while (current != end)
  91. {
  92. std::string range = (*current)[0];
  93. boost::trim(range);
  94. std::string::const_iterator f = range.begin();
  95. std::string::const_iterator l = range.end();
  96. // get the code-point range
  97. uint32_t start;
  98. uint32_t finish;
  99. parse(f, l, hex[ref(start) = ref(finish) = _1] >> -(".." >> hex[ref(finish) = _1]));
  100. // special case for UnicodeData.txt ranges:
  101. if ((*current)[1].find("First>") != std::string::npos)
  102. {
  103. ++current;
  104. BOOST_ASSERT(current != end);
  105. BOOST_ASSERT((*current)[1].find("Last>") != std::string::npos);
  106. std::string range = (*current)[0];
  107. boost::trim(range);
  108. f = range.begin();
  109. l = range.end();
  110. parse(f, l, hex[ref(finish) = _1]);
  111. }
  112. std::string code;
  113. if (field < int(current->size()))
  114. code = (*current)[field];
  115. boost::trim(code);
  116. // Only collect properties we are interested in
  117. if (collect_properties) // code for properties
  118. {
  119. if (!ignore_property(code))
  120. {
  121. for (uint32_t i = start; i <= finish; ++i)
  122. data[i] |= map_property(code);
  123. }
  124. }
  125. else // code for actual numeric values
  126. {
  127. for (uint32_t i = start; i <= finish; ++i)
  128. {
  129. if (code.empty())
  130. {
  131. data[i] = 0; // signal that this code maps to itself
  132. }
  133. else
  134. {
  135. f = code.begin();
  136. l = code.end();
  137. parse(f, l, hex, data[i]);
  138. }
  139. }
  140. }
  141. ++current;
  142. }
  143. }
  144. private:
  145. static bool ignore_property(std::string const& p)
  146. {
  147. // We don't handle all properties
  148. std::map<std::string, int>& pm = get_property_map();
  149. std::map<std::string, int>::iterator i = pm.find(p);
  150. return i == pm.end();
  151. }
  152. static int
  153. map_property(std::string const& p)
  154. {
  155. std::map<std::string, int>& pm = get_property_map();
  156. std::map<std::string, int>::iterator i = pm.find(p);
  157. BOOST_ASSERT(i != pm.end());
  158. return i->second;
  159. }
  160. static std::map<std::string, int>&
  161. get_property_map()
  162. {
  163. // The properties we are interested in:
  164. static std::map<std::string, int> map;
  165. if (map.empty())
  166. {
  167. // General_Category
  168. map["Lu"] = 0;
  169. map["Ll"] = 1;
  170. map["Lt"] = 2;
  171. map["Lm"] = 3;
  172. map["Lo"] = 4;
  173. map["Mn"] = 8;
  174. map["Me"] = 9;
  175. map["Mc"] = 10;
  176. map["Nd"] = 16;
  177. map["Nl"] = 17;
  178. map["No"] = 18;
  179. map["Zs"] = 24;
  180. map["Zl"] = 25;
  181. map["Zp"] = 26;
  182. map["Cc"] = 32;
  183. map["Cf"] = 33;
  184. map["Co"] = 34;
  185. map["Cs"] = 35;
  186. map["Cn"] = 36;
  187. map["Pd"] = 40;
  188. map["Ps"] = 41;
  189. map["Pe"] = 42;
  190. map["Pc"] = 43;
  191. map["Po"] = 44;
  192. map["Pi"] = 45;
  193. map["Pf"] = 46;
  194. map["Sm"] = 48;
  195. map["Sc"] = 49;
  196. map["Sk"] = 50;
  197. map["So"] = 51;
  198. // Derived Properties.
  199. map["Alphabetic"] = 64;
  200. map["Uppercase"] = 128;
  201. map["Lowercase"] = 256;
  202. map["White_Space"] = 512;
  203. map["Hex_Digit"] = 1024;
  204. map["Noncharacter_Code_Point"] = 2048;
  205. map["Default_Ignorable_Code_Point"] = 4096;
  206. // Script
  207. map["Arabic"] = 0;
  208. map["Imperial_Aramaic"] = 1;
  209. map["Armenian"] = 2;
  210. map["Avestan"] = 3;
  211. map["Balinese"] = 4;
  212. map["Bamum"] = 5;
  213. map["Bengali"] = 6;
  214. map["Bopomofo"] = 7;
  215. map["Braille"] = 8;
  216. map["Buginese"] = 9;
  217. map["Buhid"] = 10;
  218. map["Canadian_Aboriginal"] = 11;
  219. map["Carian"] = 12;
  220. map["Cham"] = 13;
  221. map["Cherokee"] = 14;
  222. map["Coptic"] = 15;
  223. map["Cypriot"] = 16;
  224. map["Cyrillic"] = 17;
  225. map["Devanagari"] = 18;
  226. map["Deseret"] = 19;
  227. map["Egyptian_Hieroglyphs"] = 20;
  228. map["Ethiopic"] = 21;
  229. map["Georgian"] = 22;
  230. map["Glagolitic"] = 23;
  231. map["Gothic"] = 24;
  232. map["Greek"] = 25;
  233. map["Gujarati"] = 26;
  234. map["Gurmukhi"] = 27;
  235. map["Hangul"] = 28;
  236. map["Han"] = 29;
  237. map["Hanunoo"] = 30;
  238. map["Hebrew"] = 31;
  239. map["Hiragana"] = 32;
  240. map["Katakana_Or_Hiragana"] = 33;
  241. map["Old_Italic"] = 34;
  242. map["Javanese"] = 35;
  243. map["Kayah_Li"] = 36;
  244. map["Katakana"] = 37;
  245. map["Kharoshthi"] = 38;
  246. map["Khmer"] = 39;
  247. map["Kannada"] = 40;
  248. map["Kaithi"] = 41;
  249. map["Tai_Tham"] = 42;
  250. map["Lao"] = 43;
  251. map["Latin"] = 44;
  252. map["Lepcha"] = 45;
  253. map["Limbu"] = 46;
  254. map["Linear_B"] = 47;
  255. map["Lisu"] = 48;
  256. map["Lycian"] = 49;
  257. map["Lydian"] = 50;
  258. map["Malayalam"] = 51;
  259. map["Mongolian"] = 52;
  260. map["Meetei_Mayek"] = 53;
  261. map["Myanmar"] = 54;
  262. map["Nko"] = 55;
  263. map["Ogham"] = 56;
  264. map["Ol_Chiki"] = 57;
  265. map["Old_Turkic"] = 58;
  266. map["Oriya"] = 59;
  267. map["Osmanya"] = 60;
  268. map["Phags_Pa"] = 61;
  269. map["Inscriptional_Pahlavi"] = 62;
  270. map["Phoenician"] = 63;
  271. map["Inscriptional_Parthian"] = 64;
  272. map["Rejang"] = 65;
  273. map["Runic"] = 66;
  274. map["Samaritan"] = 67;
  275. map["Old_South_Arabian"] = 68;
  276. map["Saurashtra"] = 69;
  277. map["Shavian"] = 70;
  278. map["Sinhala"] = 71;
  279. map["Sundanese"] = 72;
  280. map["Syloti_Nagri"] = 73;
  281. map["Syriac"] = 74;
  282. map["Tagbanwa"] = 75;
  283. map["Tai_Le"] = 76;
  284. map["New_Tai_Lue"] = 77;
  285. map["Tamil"] = 78;
  286. map["Tai_Viet"] = 79;
  287. map["Telugu"] = 80;
  288. map["Tifinagh"] = 81;
  289. map["Tagalog"] = 82;
  290. map["Thaana"] = 83;
  291. map["Thai"] = 84;
  292. map["Tibetan"] = 85;
  293. map["Ugaritic"] = 86;
  294. map["Vai"] = 87;
  295. map["Old_Persian"] = 88;
  296. map["Cuneiform"] = 89;
  297. map["Yi"] = 90;
  298. map["Inherited"] = 91;
  299. map["Common"] = 92;
  300. map["Unknown"] = 93;
  301. }
  302. return map;
  303. }
  304. ucd_vector info;
  305. };
  306. template <typename T, uint32_t block_size_ = 256>
  307. class ucd_table_builder
  308. {
  309. public:
  310. static uint32_t const block_size = block_size_;
  311. static uint32_t const full_span = 0x110000;
  312. typedef T value_type;
  313. ucd_table_builder() : p(new T[full_span])
  314. {
  315. for (uint32_t i = 0; i < full_span; ++i)
  316. p[i] = 0;
  317. }
  318. void collect(char const* filename, int field, bool collect_properties = true)
  319. {
  320. std::cout << "collecting " << filename << std::endl;
  321. ucd_info info(filename);
  322. info.collect(p, field, collect_properties);
  323. }
  324. void build(std::vector<uint8_t>& stage1, std::vector<T const*>& stage2)
  325. {
  326. std::cout << "building tables" << std::endl;
  327. std::map<block_ptr, std::vector<T const*> > blocks;
  328. for (T const* i = p.get(); i < (p.get() + full_span); i += block_size)
  329. blocks[block_ptr(i)].push_back(i);
  330. // Not enough bits to store the block indices.
  331. BOOST_ASSERT(blocks.size() < (1 << (sizeof(uint8_t) * 8)));
  332. typedef std::pair<block_ptr, std::vector<T const*> > blocks_value_type;
  333. std::map<T const*, std::vector<T const*> > sorted_blocks;
  334. BOOST_FOREACH(blocks_value_type const& val, blocks)
  335. {
  336. sorted_blocks[val.first.p] = val.second;
  337. }
  338. stage1.clear();
  339. stage1.reserve(full_span / block_size);
  340. stage1.resize(full_span / block_size);
  341. stage2.clear();
  342. stage2.reserve(blocks.size());
  343. typedef std::pair<T const*, std::vector<T const*> > sorted_blocks_value_type;
  344. BOOST_FOREACH(sorted_blocks_value_type const& val, sorted_blocks)
  345. {
  346. stage2.push_back(val.first);
  347. BOOST_FOREACH(T const* val2, val.second)
  348. {
  349. stage1[(val2 - p.get()) / block_size] = stage2.size() - 1;
  350. }
  351. }
  352. }
  353. private:
  354. struct block_ptr
  355. {
  356. block_ptr(T const* p) : p(p) {}
  357. friend bool operator<(block_ptr a, block_ptr b)
  358. {
  359. return std::lexicographical_compare(
  360. a.p, a.p + block_size, b.p, b.p + block_size);
  361. }
  362. T const* p;
  363. };
  364. boost::scoped_array<T> p;
  365. };
  366. template <typename Out>
  367. void print_tab(Out& out, int tab)
  368. {
  369. for (int i = 0; i < tab; ++i)
  370. out << ' ';
  371. }
  372. template <typename Out, typename C>
  373. void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
  374. {
  375. int const tab = 4;
  376. typename C::size_type size = c.size();
  377. BOOST_ASSERT(size > 1);
  378. print_tab(out, tab);
  379. out << std::setw(width) << int(c[0]);
  380. for (C::size_type i = 1; i < size; ++i)
  381. {
  382. out << ", ";
  383. if ((i % group) == 0)
  384. {
  385. out << std::endl;
  386. print_tab(out, tab);
  387. }
  388. out << std::setw(width) << int(c[i]);
  389. }
  390. if (trailing_comma)
  391. out << ", " << std::endl;
  392. }
  393. template <typename Out>
  394. void print_head(Out& out)
  395. {
  396. out
  397. << "/*=============================================================================\n"
  398. << " Copyright (c) 2001-2011 Joel de Guzman\n"
  399. << "\n"
  400. << " Distributed under the Boost Software License, Version 1.0. (See accompanying\n"
  401. << " file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n"
  402. << "\n"
  403. << " AUTOGENERATED. DO NOT EDIT!!!\n"
  404. << "==============================================================================*/\n"
  405. << "#include <boost/cstdint.hpp>\n"
  406. << "\n"
  407. << "namespace boost { namespace spirit { namespace ucd { namespace detail\n"
  408. << "{"
  409. ;
  410. }
  411. template <typename Out>
  412. void print_tail(Out& out)
  413. {
  414. out
  415. << "\n"
  416. << "}}}} // namespace boost::spirit::unicode::detail\n"
  417. ;
  418. }
  419. char const* get_int_type_name(int size)
  420. {
  421. switch (size)
  422. {
  423. case 1: return "::boost::uint8_t";
  424. case 2: return "::boost::uint16_t";
  425. case 4: return "::boost::uint32_t";
  426. case 5: return "::boost::uint64_t";
  427. default: BOOST_ASSERT(false); return 0; // invalid size
  428. };
  429. }
  430. template <typename Out, typename Builder>
  431. void print_file(Out& out, Builder& builder, int field_width, char const* name)
  432. {
  433. std::cout << "Generating " << name << " tables" << std::endl;
  434. uint32_t const block_size = Builder::block_size;
  435. typedef typename Builder::value_type value_type;
  436. print_head(out);
  437. std::vector<uint8_t> stage1;
  438. std::vector<value_type const*> stage2;
  439. builder.build(stage1, stage2);
  440. std::cout << "Block Size: " << block_size << std::endl;
  441. std::cout << "Total Bytes: "
  442. << stage1.size()+(stage2.size()*block_size*sizeof(value_type))
  443. << std::endl;
  444. out
  445. << "\n"
  446. << " static const ::boost::uint8_t " << name << "_stage1[] = {\n"
  447. << "\n"
  448. ;
  449. print_table(out, stage1, false, 3);
  450. char const* int_name = get_int_type_name(sizeof(value_type));
  451. out
  452. << "\n"
  453. << " };"
  454. << "\n"
  455. << "\n"
  456. << " static const " << int_name << ' ' << name << "_stage2[] = {"
  457. ;
  458. int block_n = 0;
  459. for (int i = 0; i < int(stage2.size()); ++i)
  460. {
  461. value_type const* p = stage2[i];
  462. bool last = (i+1 == stage2.size());
  463. out << "\n\n // block " << block_n++ << std::endl;
  464. print_table(out,
  465. boost::iterator_range<value_type const*>(p, p+block_size), !last, field_width);
  466. }
  467. out
  468. << "\n"
  469. << " };"
  470. << "\n"
  471. ;
  472. out
  473. << "\n"
  474. << " inline " << int_name << ' ' << name << "_lookup(::boost::uint32_t ch)\n"
  475. << " {\n"
  476. << " ::boost::uint32_t block_offset = " << name << "_stage1[ch / " << block_size << "] * " << block_size << ";\n"
  477. << " return " << name << "_stage2[block_offset + ch % " << block_size << "];\n"
  478. << " }\n"
  479. ;
  480. print_tail(out);
  481. }
  482. int main()
  483. {
  484. // The category tables
  485. {
  486. std::ofstream out("category_table.hpp");
  487. ucd_table_builder<uint16_t, 256> builder;
  488. builder.collect("UnicodeData.txt", 2);
  489. builder.collect("DerivedCoreProperties.txt", 1);
  490. builder.collect("PropList.txt", 1);
  491. print_file(out, builder, 4, "category");
  492. }
  493. // The script tables
  494. {
  495. std::ofstream out("script_table.hpp");
  496. ucd_table_builder<uint8_t, 256> builder;
  497. builder.collect("Scripts.txt", 1);
  498. print_file(out, builder, 3, "script");
  499. }
  500. // The lowercase tables
  501. {
  502. std::ofstream out("lowercase_table.hpp");
  503. ucd_table_builder<uint32_t, 256> builder;
  504. builder.collect("UnicodeData.txt", 13, false);
  505. print_file(out, builder, 6, "lowercase");
  506. }
  507. // The uppercase tables
  508. {
  509. std::ofstream out("uppercase_table.hpp");
  510. ucd_table_builder<uint32_t, 256> builder;
  511. builder.collect("UnicodeData.txt", 12, false);
  512. print_file(out, builder, 6, "uppercase");
  513. }
  514. return 0;
  515. }