123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584 |
- /*=============================================================================
- Copyright (c) 2001-2011 Joel de Guzman
- Distributed under the Boost Software License, Version 1.0. (See accompanying
- file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
- =============================================================================*/
#include <boost/config/warning_disable.hpp>
#include <boost/spirit/include/qi.hpp>
#include <boost/spirit/include/phoenix.hpp>
#include <boost/unordered_map.hpp>
#include <boost/algorithm/string/trim.hpp>
#include <boost/cstdint.hpp>
#include <boost/foreach.hpp>
#include <boost/array.hpp>
#include <boost/scoped_array.hpp>
#include <boost/range/iterator_range.hpp>
#include <algorithm>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <iterator>
#include <map>
#include <string>
#include <vector>
- // We place the data here. Each line comprises various fields
- typedef std::vector<std::string> ucd_line;
- typedef std::vector<ucd_line> ucd_vector;
- typedef std::vector<ucd_line>::iterator ucd_iterator;
- // spirit and phoenix using declarations
- using boost::spirit::qi::parse;
- using boost::spirit::qi::hex;
- using boost::spirit::qi::char_;
- using boost::spirit::qi::eol;
- using boost::spirit::qi::rule;
- using boost::spirit::qi::omit;
- using boost::spirit::qi::_1;
- using boost::spirit::qi::_val;
- using boost::phoenix::push_back;
- using boost::phoenix::ref;
- // basic unsigned types
- using boost::uint8_t;
- using boost::uint16_t;
- using boost::uint32_t;
- // a char range
// An inclusive range of code points, [start, finish].
struct ucd_range
{
    uint32_t start;
    uint32_t finish;

    ucd_range(uint32_t first, uint32_t last)
      : start(first), finish(last) {}

    // Ordering looks at the lower bound only; this is what lets a
    // ucd_range serve as the key of an ordered (multi)map.
    friend bool operator<(ucd_range const& lhs, ucd_range const& rhs)
    {
        return rhs.start > lhs.start;
    }
};
- class ucd_info
- {
- public:
- ucd_info(char const* filename)
- {
- std::ifstream in(filename, std::ios_base::in);
- if (!in)
- {
- std::cerr << "Error: Could not open input file: "
- << filename << std::endl;
- }
- else
- {
- std::string data; // We will read the contents here.
- in.unsetf(std::ios::skipws); // No white space skipping!
- std::copy(
- std::istream_iterator<char>(in),
- std::istream_iterator<char>(),
- std::back_inserter(data));
- typedef std::string::const_iterator iterator_type;
- iterator_type f = data.begin();
- iterator_type l = data.end();
- rule<iterator_type> endl = -('#' >> *(char_-eol)) >> eol;
- rule<iterator_type, std::string()> field = *(char_-(';'|endl)) >> (';'|&endl);
- rule<iterator_type, ucd_line()> line = +(field-endl) >> endl;
- rule<iterator_type, std::vector<ucd_line>()> file = +(endl | line[push_back(_val, _1)]);
- parse(f, l, file, info);
- }
- }
- template <typename Array>
- void collect(Array& data, int field, bool collect_properties = true) const
- {
- BOOST_ASSERT(!info.empty());
- ucd_vector::const_iterator current = info.begin();
- ucd_vector::const_iterator end = info.end();
- while (current != end)
- {
- std::string range = (*current)[0];
- boost::trim(range);
- std::string::const_iterator f = range.begin();
- std::string::const_iterator l = range.end();
- // get the code-point range
- uint32_t start;
- uint32_t finish;
- parse(f, l, hex[ref(start) = ref(finish) = _1] >> -(".." >> hex[ref(finish) = _1]));
- // special case for UnicodeData.txt ranges:
- if ((*current)[1].find("First>") != std::string::npos)
- {
- ++current;
- BOOST_ASSERT(current != end);
- BOOST_ASSERT((*current)[1].find("Last>") != std::string::npos);
- std::string range = (*current)[0];
- boost::trim(range);
- f = range.begin();
- l = range.end();
- parse(f, l, hex[ref(finish) = _1]);
- }
- std::string code;
- if (field < int(current->size()))
- code = (*current)[field];
- boost::trim(code);
- // Only collect properties we are interested in
- if (collect_properties) // code for properties
- {
- if (!ignore_property(code))
- {
- for (uint32_t i = start; i <= finish; ++i)
- data[i] |= map_property(code);
- }
- }
- else // code for actual numeric values
- {
- for (uint32_t i = start; i <= finish; ++i)
- {
- if (code.empty())
- {
- data[i] = 0; // signal that this code maps to itself
- }
- else
- {
- f = code.begin();
- l = code.end();
- parse(f, l, hex, data[i]);
- }
- }
- }
- ++current;
- }
- }
- private:
- static bool ignore_property(std::string const& p)
- {
- // We don't handle all properties
- std::map<std::string, int>& pm = get_property_map();
- std::map<std::string, int>::iterator i = pm.find(p);
- return i == pm.end();
- }
- static int
- map_property(std::string const& p)
- {
- std::map<std::string, int>& pm = get_property_map();
- std::map<std::string, int>::iterator i = pm.find(p);
- BOOST_ASSERT(i != pm.end());
- return i->second;
- }
- static std::map<std::string, int>&
- get_property_map()
- {
- // The properties we are interested in:
- static std::map<std::string, int> map;
- if (map.empty())
- {
- // General_Category
- map["Lu"] = 0;
- map["Ll"] = 1;
- map["Lt"] = 2;
- map["Lm"] = 3;
- map["Lo"] = 4;
- map["Mn"] = 8;
- map["Me"] = 9;
- map["Mc"] = 10;
- map["Nd"] = 16;
- map["Nl"] = 17;
- map["No"] = 18;
- map["Zs"] = 24;
- map["Zl"] = 25;
- map["Zp"] = 26;
- map["Cc"] = 32;
- map["Cf"] = 33;
- map["Co"] = 34;
- map["Cs"] = 35;
- map["Cn"] = 36;
- map["Pd"] = 40;
- map["Ps"] = 41;
- map["Pe"] = 42;
- map["Pc"] = 43;
- map["Po"] = 44;
- map["Pi"] = 45;
- map["Pf"] = 46;
- map["Sm"] = 48;
- map["Sc"] = 49;
- map["Sk"] = 50;
- map["So"] = 51;
- // Derived Properties.
- map["Alphabetic"] = 64;
- map["Uppercase"] = 128;
- map["Lowercase"] = 256;
- map["White_Space"] = 512;
- map["Hex_Digit"] = 1024;
- map["Noncharacter_Code_Point"] = 2048;
- map["Default_Ignorable_Code_Point"] = 4096;
- // Script
- map["Arabic"] = 0;
- map["Imperial_Aramaic"] = 1;
- map["Armenian"] = 2;
- map["Avestan"] = 3;
- map["Balinese"] = 4;
- map["Bamum"] = 5;
- map["Bengali"] = 6;
- map["Bopomofo"] = 7;
- map["Braille"] = 8;
- map["Buginese"] = 9;
- map["Buhid"] = 10;
- map["Canadian_Aboriginal"] = 11;
- map["Carian"] = 12;
- map["Cham"] = 13;
- map["Cherokee"] = 14;
- map["Coptic"] = 15;
- map["Cypriot"] = 16;
- map["Cyrillic"] = 17;
- map["Devanagari"] = 18;
- map["Deseret"] = 19;
- map["Egyptian_Hieroglyphs"] = 20;
- map["Ethiopic"] = 21;
- map["Georgian"] = 22;
- map["Glagolitic"] = 23;
- map["Gothic"] = 24;
- map["Greek"] = 25;
- map["Gujarati"] = 26;
- map["Gurmukhi"] = 27;
- map["Hangul"] = 28;
- map["Han"] = 29;
- map["Hanunoo"] = 30;
- map["Hebrew"] = 31;
- map["Hiragana"] = 32;
- map["Katakana_Or_Hiragana"] = 33;
- map["Old_Italic"] = 34;
- map["Javanese"] = 35;
- map["Kayah_Li"] = 36;
- map["Katakana"] = 37;
- map["Kharoshthi"] = 38;
- map["Khmer"] = 39;
- map["Kannada"] = 40;
- map["Kaithi"] = 41;
- map["Tai_Tham"] = 42;
- map["Lao"] = 43;
- map["Latin"] = 44;
- map["Lepcha"] = 45;
- map["Limbu"] = 46;
- map["Linear_B"] = 47;
- map["Lisu"] = 48;
- map["Lycian"] = 49;
- map["Lydian"] = 50;
- map["Malayalam"] = 51;
- map["Mongolian"] = 52;
- map["Meetei_Mayek"] = 53;
- map["Myanmar"] = 54;
- map["Nko"] = 55;
- map["Ogham"] = 56;
- map["Ol_Chiki"] = 57;
- map["Old_Turkic"] = 58;
- map["Oriya"] = 59;
- map["Osmanya"] = 60;
- map["Phags_Pa"] = 61;
- map["Inscriptional_Pahlavi"] = 62;
- map["Phoenician"] = 63;
- map["Inscriptional_Parthian"] = 64;
- map["Rejang"] = 65;
- map["Runic"] = 66;
- map["Samaritan"] = 67;
- map["Old_South_Arabian"] = 68;
- map["Saurashtra"] = 69;
- map["Shavian"] = 70;
- map["Sinhala"] = 71;
- map["Sundanese"] = 72;
- map["Syloti_Nagri"] = 73;
- map["Syriac"] = 74;
- map["Tagbanwa"] = 75;
- map["Tai_Le"] = 76;
- map["New_Tai_Lue"] = 77;
- map["Tamil"] = 78;
- map["Tai_Viet"] = 79;
- map["Telugu"] = 80;
- map["Tifinagh"] = 81;
- map["Tagalog"] = 82;
- map["Thaana"] = 83;
- map["Thai"] = 84;
- map["Tibetan"] = 85;
- map["Ugaritic"] = 86;
- map["Vai"] = 87;
- map["Old_Persian"] = 88;
- map["Cuneiform"] = 89;
- map["Yi"] = 90;
- map["Inherited"] = 91;
- map["Common"] = 92;
- map["Unknown"] = 93;
- }
- return map;
- }
- ucd_vector info;
- };
- template <typename T, uint32_t block_size_ = 256>
- class ucd_table_builder
- {
- public:
- static uint32_t const block_size = block_size_;
- static uint32_t const full_span = 0x110000;
- typedef T value_type;
- ucd_table_builder() : p(new T[full_span])
- {
- for (uint32_t i = 0; i < full_span; ++i)
- p[i] = 0;
- }
- void collect(char const* filename, int field, bool collect_properties = true)
- {
- std::cout << "collecting " << filename << std::endl;
- ucd_info info(filename);
- info.collect(p, field, collect_properties);
- }
- void build(std::vector<uint8_t>& stage1, std::vector<T const*>& stage2)
- {
- std::cout << "building tables" << std::endl;
- std::map<block_ptr, std::vector<T const*> > blocks;
- for (T const* i = p.get(); i < (p.get() + full_span); i += block_size)
- blocks[block_ptr(i)].push_back(i);
- // Not enough bits to store the block indices.
- BOOST_ASSERT(blocks.size() < (1 << (sizeof(uint8_t) * 8)));
- typedef std::pair<block_ptr, std::vector<T const*> > blocks_value_type;
- std::map<T const*, std::vector<T const*> > sorted_blocks;
- BOOST_FOREACH(blocks_value_type const& val, blocks)
- {
- sorted_blocks[val.first.p] = val.second;
- }
- stage1.clear();
- stage1.reserve(full_span / block_size);
- stage1.resize(full_span / block_size);
- stage2.clear();
- stage2.reserve(blocks.size());
- typedef std::pair<T const*, std::vector<T const*> > sorted_blocks_value_type;
- BOOST_FOREACH(sorted_blocks_value_type const& val, sorted_blocks)
- {
- stage2.push_back(val.first);
- BOOST_FOREACH(T const* val2, val.second)
- {
- stage1[(val2 - p.get()) / block_size] = stage2.size() - 1;
- }
- }
- }
- private:
- struct block_ptr
- {
- block_ptr(T const* p) : p(p) {}
- friend bool operator<(block_ptr a, block_ptr b)
- {
- return std::lexicographical_compare(
- a.p, a.p + block_size, b.p, b.p + block_size);
- }
- T const* p;
- };
- boost::scoped_array<T> p;
- };
// Writes `tab` space characters to `out`; writes nothing if tab <= 0.
template <typename Out>
void print_tab(Out& out, int tab)
{
    for (int remaining = tab; remaining > 0; --remaining)
    {
        out << ' ';
    }
}
// Writes the elements of `c` to `out` as a comma-separated list of integers.
//
// out             Output stream.
// c               Container with size() and operator[]; every element is
//                 converted to int before it is written.
// trailing_comma  If true, ", " and a newline are written after the last
//                 element.
// width           Field width used for every element.
// group           Number of elements per output line.
//
// Every output line is indented by four spaces.
template <typename Out, typename C>
void print_table(Out& out, C const& c, bool trailing_comma, int width = 4, int group = 16)
{
    // C::size_type is a dependent name, so it has to be introduced with
    // `typename` wherever it is used as a type.
    typedef typename C::size_type size_type;

    int const tab = 4;
    size_type const size = c.size();
    BOOST_ASSERT(size > 1);
    print_tab(out, tab);
    out << std::setw(width) << int(c[0]);
    for (size_type i = 1; i < size; ++i)
    {
        out << ", ";
        if ((i % group) == 0)
        {
            // start the next line of `group` elements
            out << std::endl;
            print_tab(out, tab);
        }
        out << std::setw(width) << int(c[i]);
    }

    if (trailing_comma)
        out << ", " << std::endl;
}
// Writes the fixed prologue of a generated header to `out`: the license
// banner, the <boost/cstdint.hpp> include and the opening of the
// boost::spirit::ucd::detail namespace.
template <typename Out>
void print_head(Out& out)
{
    static char const* const pieces[] =
    {
        "/*=============================================================================\n",
        " Copyright (c) 2001-2011 Joel de Guzman\n",
        "\n",
        " Distributed under the Boost Software License, Version 1.0. (See accompanying\n",
        " file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)\n",
        "\n",
        " AUTOGENERATED. DO NOT EDIT!!!\n",
        "==============================================================================*/\n",
        "#include <boost/cstdint.hpp>\n",
        "\n",
        "namespace boost { namespace spirit { namespace ucd { namespace detail\n",
        "{"
    };

    char const* const* const stop = pieces + sizeof(pieces) / sizeof(pieces[0]);
    for (char const* const* piece = pieces; piece != stop; ++piece)
    {
        out << *piece;
    }
}
// Writes the fixed epilogue of a generated header to `out`: a newline, the
// four closing braces of the namespaces and a comment naming them.
template <typename Out>
void print_tail(Out& out)
{
    // The comment names the namespace that is actually being closed,
    // boost::spirit::ucd::detail.
    out
        << "\n"
        << "}}}} // namespace boost::spirit::ucd::detail\n"
        ;
}
- char const* get_int_type_name(int size)
- {
- switch (size)
- {
- case 1: return "::boost::uint8_t";
- case 2: return "::boost::uint16_t";
- case 4: return "::boost::uint32_t";
- case 5: return "::boost::uint64_t";
- default: BOOST_ASSERT(false); return 0; // invalid size
- };
- }
- template <typename Out, typename Builder>
- void print_file(Out& out, Builder& builder, int field_width, char const* name)
- {
- std::cout << "Generating " << name << " tables" << std::endl;
- uint32_t const block_size = Builder::block_size;
- typedef typename Builder::value_type value_type;
- print_head(out);
- std::vector<uint8_t> stage1;
- std::vector<value_type const*> stage2;
- builder.build(stage1, stage2);
- std::cout << "Block Size: " << block_size << std::endl;
- std::cout << "Total Bytes: "
- << stage1.size()+(stage2.size()*block_size*sizeof(value_type))
- << std::endl;
- out
- << "\n"
- << " static const ::boost::uint8_t " << name << "_stage1[] = {\n"
- << "\n"
- ;
- print_table(out, stage1, false, 3);
- char const* int_name = get_int_type_name(sizeof(value_type));
- out
- << "\n"
- << " };"
- << "\n"
- << "\n"
- << " static const " << int_name << ' ' << name << "_stage2[] = {"
- ;
- int block_n = 0;
- for (int i = 0; i < int(stage2.size()); ++i)
- {
- value_type const* p = stage2[i];
- bool last = (i+1 == stage2.size());
- out << "\n\n // block " << block_n++ << std::endl;
- print_table(out,
- boost::iterator_range<value_type const*>(p, p+block_size), !last, field_width);
- }
- out
- << "\n"
- << " };"
- << "\n"
- ;
- out
- << "\n"
- << " inline " << int_name << ' ' << name << "_lookup(::boost::uint32_t ch)\n"
- << " {\n"
- << " ::boost::uint32_t block_offset = " << name << "_stage1[ch / " << block_size << "] * " << block_size << ";\n"
- << " return " << name << "_stage2[block_offset + ch % " << block_size << "];\n"
- << " }\n"
- ;
- print_tail(out);
- }
- int main()
- {
- // The category tables
- {
- std::ofstream out("category_table.hpp");
- ucd_table_builder<uint16_t, 256> builder;
- builder.collect("UnicodeData.txt", 2);
- builder.collect("DerivedCoreProperties.txt", 1);
- builder.collect("PropList.txt", 1);
- print_file(out, builder, 4, "category");
- }
- // The script tables
- {
- std::ofstream out("script_table.hpp");
- ucd_table_builder<uint8_t, 256> builder;
- builder.collect("Scripts.txt", 1);
- print_file(out, builder, 3, "script");
- }
- // The lowercase tables
- {
- std::ofstream out("lowercase_table.hpp");
- ucd_table_builder<uint32_t, 256> builder;
- builder.collect("UnicodeData.txt", 13, false);
- print_file(out, builder, 6, "lowercase");
- }
- // The uppercase tables
- {
- std::ofstream out("uppercase_table.hpp");
- ucd_table_builder<uint32_t, 256> builder;
- builder.collect("UnicodeData.txt", 12, false);
- print_file(out, builder, 6, "uppercase");
- }
- return 0;
- }
|