123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302 |
- /////////1/////////2/////////3/////////4/////////5/////////6/////////7/////////8
- // test_utf8_codecvt.cpp
- // (C) Copyright 2002-4 Robert Ramey - http://www.rrsd.com .
- // Use, modification and distribution is subject to the Boost Software
- // License, Version 1.0. (See accompanying file LICENSE_1_0.txt or copy at
- // http://www.boost.org/LICENSE_1_0.txt)
- #include <algorithm> // std::copy
- #include <fstream>
- #include <iostream>
- #include <iterator>
- #include <locale>
- #include <vector>
- #include <string>
- #include <cstddef> // size_t
- #include <cwchar>
- #include <boost/config.hpp>
- #include <boost/core/no_exceptions_support.hpp>
- #define BOOST_UTF8_BEGIN_NAMESPACE namespace boost { namespace detail {
- #define BOOST_UTF8_END_NAMESPACE } }
- #include <boost/detail/utf8_codecvt_facet.hpp>
- #include <boost/detail/utf8_codecvt_facet.ipp>
// Workaround for standard libraries that declare the C library names only in
// the global namespace: re-export the ones this test spells with a std::
// qualifier (std::size_t, std::wcslen, std::wint_t).
#if defined(BOOST_NO_STDC_NAMESPACE)
namespace std{
    using ::size_t;
    using ::wcslen;
    #if !defined(UNDER_CE) && !defined(__PGIC__)
    // The type used below is wint_t; there is no such thing as ::w_int.
    using ::wint_t;
    #endif
} // namespace std
#endif

// Note: copied from boost/iostreams/char_traits.hpp
//
// Dinkumware that comes with QNX Momentics 6.3.0, 4.0.2, incorrectly defines
// the EOF and WEOF macros to not std:: qualify the wint_t type (and so does
// Sun C++ 5.8 + STLport 4). Fix by placing the def in this scope, so that the
// unqualified wint_t inside the WEOF expansion resolves.
// NOTE: Use BOOST_WORKAROUND?
#if (defined(__QNX__) && defined(BOOST_DINKUMWARE_STDLIB)) \
    || defined(__SUNPRO_CC)
using ::std::wint_t;
#endif
- #include <boost/core/lightweight_test.hpp>
// Reference data for the codecvt round-trip tests: the same sequence of code
// points spelled once as UTF-8 bytes and once as wide characters.
//
// The template argument is the size of wchar_t in bytes (test_main
// instantiates test_data<sizeof(wchar_t)>), so the <2> specializations hold
// values that fit a 16-bit wchar_t and the <4> specializations go up to
// U+10FFFF.  Only the two explicit specializations below provide definitions
// of the arrays.
template<std::size_t s>
struct test_data
{
    static unsigned char utf8_encoding[];
    static wchar_t wchar_encoding[];
};

// 2-byte wchar_t: boundaries of the 1-, 2- and 3-byte UTF-8 forms.
template<>
unsigned char test_data<2>::utf8_encoding[] = {
    0x01,
    0x7f,
    0xc2, 0x80,
    0xdf, 0xbf,
    0xe0, 0xa0, 0x80,
    0xe7, 0xbf, 0xbf
};

template<>
wchar_t test_data<2>::wchar_encoding[] = {
    0x0001,
    0x007f,
    0x0080,
    0x07ff,
    0x0800,
    0x7fff
};

// 4-byte wchar_t: boundaries of the 1- to 4-byte UTF-8 forms.
template<>
unsigned char test_data<4>::utf8_encoding[] = {
    0x01,
    0x7f,
    0xc2, 0x80,
    0xdf, 0xbf,
    0xe0, 0xa0, 0x80,
    0xef, 0xbf, 0xbf,
    0xf0, 0x90, 0x80, 0x80,
    0xf4, 0x8f, 0xbf, 0xbf,
    /* codecvt implementations for clang and gcc don't handle more than 21 bits and
     * return eof accordingly. So don't test the whole 32 bit range
     */
    /*
    0xf7, 0xbf, 0xbf, 0xbf,
    0xf8, 0x88, 0x80, 0x80, 0x80,
    0xfb, 0xbf, 0xbf, 0xbf, 0xbf,
    0xfc, 0x84, 0x80, 0x80, 0x80, 0x80,
    0xfd, 0xbf, 0xbf, 0xbf, 0xbf, 0xbf
    */
};

// The explicit casts keep this table well-formed even when it is compiled
// (but not used) on a platform whose wchar_t is narrower than 32 bits.
template<>
wchar_t test_data<4>::wchar_encoding[] = {
    static_cast<wchar_t>(0x00000001),
    static_cast<wchar_t>(0x0000007f),
    static_cast<wchar_t>(0x00000080),
    static_cast<wchar_t>(0x000007ff),
    static_cast<wchar_t>(0x00000800),
    static_cast<wchar_t>(0x0000ffff),
    static_cast<wchar_t>(0x00010000),
    static_cast<wchar_t>(0x0010ffff),
    /* codecvt implementations for clang and gcc don't handle more than 21 bits and
     * return eof accordingly. So don't test the whole 32 bit range
     */
    /*
    static_cast<wchar_t>(0x001fffff),
    static_cast<wchar_t>(0x00200000),
    static_cast<wchar_t>(0x03ffffff),
    static_cast<wchar_t>(0x04000000),
    static_cast<wchar_t>(0x7fffffff)
    */
};
- int
- test_main(int /* argc */, char * /* argv */[]) {
- std::locale utf8_locale
- = std::locale(
- std::locale::classic(),
- new boost::detail::utf8_codecvt_facet
- );
- typedef char utf8_t;
- // define test data compatible with the wchar_t implementation
- // as either ucs-2 or ucs-4 depending on the compiler/library.
- typedef test_data<sizeof(wchar_t)> td;
- // Send our test UTF-8 data to file
- {
- std::ofstream ofs;
- ofs.open("test.dat");
- std::copy(
- td::utf8_encoding,
- td::utf8_encoding + sizeof(td::utf8_encoding) / sizeof(unsigned char),
- std::ostream_iterator<utf8_t>(ofs)
- );
- }
- // Read the test data back in, converting to UCS-4 on the way in
- std::vector<wchar_t> from_file;
- {
- std::wifstream ifs;
- ifs.imbue(utf8_locale);
- ifs.open("test.dat");
- std::wint_t item = 0;
- // note can't use normal vector from iterator constructor because
- // dinkumware doesn't have it.
- for(;;){
- item = ifs.get();
- if(item == WEOF)
- break;
- //ifs >> item;
- //if(ifs.eof())
- // break;
- from_file.push_back(item);
- }
- }
- BOOST_TEST(std::equal(from_file.begin(), from_file.end(), td::wchar_encoding));
-
- // Send the UCS4_data back out, converting to UTF-8
- {
- std::wofstream ofs;
- ofs.imbue(utf8_locale);
- ofs.open("test2.dat");
- std::copy(
- from_file.begin(),
- from_file.end(),
- std::ostream_iterator<wchar_t, wchar_t>(ofs)
- );
- }
- // Make sure that both files are the same
- {
- typedef std::istream_iterator<utf8_t> is_iter;
- is_iter end_iter;
- std::ifstream ifs1("test.dat");
- is_iter it1(ifs1);
- std::vector<utf8_t> data1;
- std::copy(it1, end_iter, std::back_inserter(data1));
- std::ifstream ifs2("test2.dat");
- is_iter it2(ifs2);
- std::vector<utf8_t> data2;
- std::copy(it2, end_iter, std::back_inserter(data2));
- BOOST_TEST(data1 == data2);
- }
- // some libraries have trouble that only shows up with longer strings
-
- const wchar_t * test3_data = L"\
- <?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"yes\" ?>\
- <!DOCTYPE boost_serialization>\
- <boost_serialization signature=\"serialization::archive\" version=\"3\">\
- <a class_id=\"0\" tracking_level=\"0\">\
- <b>1</b>\
- <f>96953204</f>\
- <g>177129195</g>\
- <l>1</l>\
- <m>5627</m>\
- <n>23010</n>\
- <o>7419</o>\
- <p>16212</p>\
- <q>4086</q>\
- <r>2749</r>\
- <c>-33</c>\
- <s>124</s>\
- <t>28</t>\
- <u>32225</u>\
- <v>17543</v>\
- <w>0.84431422</w>\
- <x>1.0170664757130923</x>\
- <y>tjbx</y>\
- <z>cuwjentqpkejp</z>\
- </a>\
- </boost_serialization>\
- ";
-
- // Send the UCS4_data back out, converting to UTF-8
- std::size_t l = std::wcslen(test3_data);
- {
- std::wofstream ofs;
- ofs.imbue(utf8_locale);
- ofs.open("test3.dat");
- std::copy(
- test3_data,
- test3_data + l,
- std::ostream_iterator<wchar_t, wchar_t>(ofs)
- );
- }
- // Make sure that both files are the same
- {
- std::wifstream ifs;
- ifs.imbue(utf8_locale);
- ifs.open("test3.dat");
- ifs >> std::noskipws;
- BOOST_TEST(
- std::equal(
- test3_data,
- test3_data + l,
- std::istream_iterator<wchar_t, wchar_t>(ifs)
- )
- );
- }
- // Test length calculation
- {
- std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
- std::mbstate_t mbs = std::mbstate_t();
- const int utf8_len = sizeof(td::utf8_encoding) / sizeof(*td::utf8_encoding);
- int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + utf8_len), ~static_cast< std::size_t >(0u));
- BOOST_TEST_EQ(utf8_len, res);
- }
- // Test that length calculation detects character boundaries
- {
- std::codecvt<wchar_t, char, std::mbstate_t> const& fac = std::use_facet< std::codecvt<wchar_t, char, std::mbstate_t> >(utf8_locale);
- std::mbstate_t mbs = std::mbstate_t();
- // The first 5 bytes of utf8_encoding contain 3 complete UTF-8 characters (taking 4 bytes in total) and 1 byte of an incomplete character.
- // This last byte should not be accounted by length().
- const int input_len = 5;
- const int utf8_len = 4;
- int res = fac.length(mbs, reinterpret_cast< const char* >(td::utf8_encoding), reinterpret_cast< const char* >(td::utf8_encoding + input_len), ~static_cast< std::size_t >(0u));
- BOOST_TEST_EQ(utf8_len, res);
- }
- return EXIT_SUCCESS;
- }
- int
- main(int argc, char * argv[]){
- int retval = 1;
- BOOST_TRY{
- retval = test_main(argc, argv);
- }
- #ifndef BOOST_NO_EXCEPTION_STD_NAMESPACE
- BOOST_CATCH(const std::exception & e){
- BOOST_ERROR(e.what());
- }
- #endif
- BOOST_CATCH(...){
- BOOST_ERROR("failed with uncaught exception:");
- }
- BOOST_CATCH_END
- int error_count = boost::report_errors();
- if(error_count > 0)
- retval = error_count;
- return retval;
- }
|