test_codepage.cpp 17 KB

  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See
  5. // accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. //
  8. #include <boost/locale/encoding.hpp>
  9. #include <boost/locale/generator.hpp>
  10. #include <boost/locale/localization_backend.hpp>
  11. #include <boost/locale/info.hpp>
  12. #include <boost/locale/config.hpp>
  13. #include <fstream>
  14. #include "test_locale.hpp"
  15. #include "test_locale_tools.hpp"
  17. # ifdef __APPLE__
  18. # include <xlocale.h>
  19. # endif
  20. # include <locale.h>
  21. #endif
  22. #if !defined(BOOST_LOCALE_WITH_ICU) && !defined(BOOST_LOCALE_WITH_ICONV) && (defined(BOOST_WINDOWS) || defined(__CYGWIN__))
  23. #ifndef NOMINMAX
  24. # define NOMINMAX
  25. #endif
  26. #include <windows.h>
  27. #endif
  // Feature switches; main() recomputes them for each backend before running the tests.
  bool test_iso = false;
  bool test_iso_8859_8 = true;   // main() may clear this after the Windows code page check
  bool test_utf = false;
  bool test_sjis = false;

  // Locale names used by the 8-bit and Shift-JIS stream tests; assigned in main().
  std::string he_il_8bit;
  std::string en_us_8bit;
  std::string ja_jp_shiftjis;
  /// Reads every remaining character of `in`, one at a time, and returns
  /// them as a single string. Reading stops at the first failed get().
  template<typename Char>
  std::basic_string<Char> read_file(std::basic_istream<Char> &in)
  {
      std::basic_string<Char> contents;
      for(Char ch; in.get(ch); ) {
          contents.push_back(ch);
      }
      return contents;
  }
  44. template<typename Char>
  45. void test_ok(std::string file,std::locale const &l,std::basic_string<Char> cmp=std::basic_string<Char>())
  46. {
  47. if(cmp.empty())
  48. cmp=to<Char>(file);
  49. std::ofstream test("testi.txt");
  50. test << file;
  51. test.close();
  52. typedef std::basic_fstream<Char> stream_type;
  53. stream_type f1("testi.txt",stream_type::in);
  54. f1.imbue(l);
  55. TEST(read_file<Char>(f1) == cmp);
  56. f1.close();
  57. stream_type f2("testo.txt",stream_type::out);
  58. f2.imbue(l);
  59. f2 << cmp;
  60. f2.close();
  61. std::ifstream testo("testo.txt");
  62. TEST(read_file<char>(testo) == file);
  63. }
  /// Read-failure check: after writing `file` to "testi.txt", a
  /// basic_fstream<Char> imbued with `l` must fail no later than the read
  /// that follows `pos` successful characters.
  template<typename Char>
  void test_rfail(std::string file,std::locale const &l,int pos)
  {
      typedef std::basic_fstream<Char> stream_type;

      std::ofstream raw_out("testi.txt");
      raw_out << file;
      raw_out.close();

      stream_type reader("testi.txt",stream_type::in);
      reader.imbue(l);

      Char ch;
      int consumed=0;
      while(consumed<pos) {
          reader.get(ch);
          if(reader.fail()) {
              // The error was detected earlier than `pos`; that is acceptable.
              return;
          }
          TEST(reader);
          ++consumed;
      }
      // All `pos` reads succeeded, so the next one MUST fail.
      TEST(reader.get(ch).fail());
  }
  85. template<typename Char>
  86. void test_wfail(std::string file,std::locale const &l,int pos)
  87. {
  88. typedef std::basic_fstream<Char> stream_type;
  89. stream_type f1("testo.txt",stream_type::out);
  90. f1.imbue(l);
  91. std::basic_string<Char> out=to<Char>(file);
  92. int i;
  93. for(i=0;i<pos;i++) {
  94. f1 << out.at(i);
  95. f1<<std::flush;
  96. TEST(f1.good());
  97. }
  98. f1 << out.at(i);
  99. TEST(f1.fail() || (f1<<std::flush).fail());
  100. }
  101. template<typename Char>
  102. void test_for_char()
  103. {
  104. boost::locale::generator g;
  105. if(test_utf) {
  106. std::cout << " UTF-8" << std::endl;
  107. test_ok<Char>("grüße\nn i",g("en_US.UTF-8"));
  108. test_rfail<Char>("abc\xFF\xFF",g("en_US.UTF-8"),3);
  109. std::cout << " Testing codepoints above 0xFFFF" << std::endl;
  110. std::cout << " Single U+2008A" << std::endl;
  111. test_ok<Char>("\xf0\xa0\x82\x8a",g("en_US.UTF-8")); // U+2008A
  112. std::cout << " Single U+2008A withing text" << std::endl;
  113. test_ok<Char>("abc\"\xf0\xa0\x82\x8a\"",g("en_US.UTF-8")); // U+2008A
  114. std::string one = "\xf0\xa0\x82\x8a";
  115. std::string res;
  116. for(unsigned i=0;i<1000;i++)
  117. res+=one;
  118. std::cout << " U+2008A x 1000" << std::endl;
  119. test_ok<Char>(res.c_str(),g("en_US.UTF-8")); // U+2008A
  120. }
  121. else {
  122. std::cout << " UTF-8 Not supported " << std::endl;
  123. }
  124. if(test_iso) {
  125. if(test_iso_8859_8) {
  126. std::cout << " ISO8859-8" << std::endl;
  127. test_ok<Char>("hello \xf9\xec\xe5\xed",g(he_il_8bit),to<Char>("hello שלום"));
  128. }
  129. std::cout << " ISO8859-1" << std::endl;
  130. test_ok<Char>(to<char>("grüße\nn i"),g(en_us_8bit),to<Char>("grüße\nn i"));
  131. test_wfail<Char>("grüßen שלום",g(en_us_8bit),7);
  132. }
  133. if(test_sjis) {
  134. std::cout << " Shift-JIS" << std::endl;
  135. test_ok<Char>("\x93\xfa\x96\x7b",g(ja_jp_shiftjis),
  136. boost::locale::conv::to_utf<Char>("\xe6\x97\xa5\xe6\x9c\xac","UTF-8")); // Japan
  137. }
  138. }
  139. void test_wide_io()
  140. {
  141. std::cout << " wchar_t" << std::endl;
  142. test_for_char<wchar_t>();
  143. #if defined BOOST_LOCALE_ENABLE_CHAR16_T && !defined(BOOST_NO_CHAR16_T_CODECVT)
  144. std::cout << " char16_t" << std::endl;
  145. test_for_char<char16_t>();
  146. #endif
  147. #if defined BOOST_LOCALE_ENABLE_CHAR32_T && !defined(BOOST_NO_CHAR32_T_CODECVT)
  148. std::cout << " char32_t" << std::endl;
  149. test_for_char<char32_t>();
  150. #endif
  151. }
  152. template<typename Char>
  153. void test_pos(std::string source,std::basic_string<Char> target,std::string encoding)
  154. {
  155. using namespace boost::locale::conv;
  156. boost::locale::generator g;
  157. std::locale l= encoding == "ISO8859-8" ? g("he_IL."+encoding) : g("en_US."+encoding);
  158. TEST(to_utf<Char>(source,encoding)==target);
  159. TEST(to_utf<Char>(source.c_str(),encoding)==target);
  160. TEST(to_utf<Char>(source.c_str(),source.c_str()+source.size(),encoding)==target);
  161. TEST(to_utf<Char>(source,l)==target);
  162. TEST(to_utf<Char>(source.c_str(),l)==target);
  163. TEST(to_utf<Char>(source.c_str(),source.c_str()+source.size(),l)==target);
  164. TEST(from_utf<Char>(target,encoding)==source);
  165. TEST(from_utf<Char>(target.c_str(),encoding)==source);
  166. TEST(from_utf<Char>(target.c_str(),target.c_str()+target.size(),encoding)==source);
  167. TEST(from_utf<Char>(target,l)==source);
  168. TEST(from_utf<Char>(target.c_str(),l)==source);
  169. TEST(from_utf<Char>(target.c_str(),target.c_str()+target.size(),l)==source);
  170. }
  // Shorthand: TEST_THROWS with boost::locale::conv::conversion_error as the exception type.
  #define TESTF(X) \
      TEST_THROWS(X,boost::locale::conv::conversion_error)
  172. template<typename Char>
  173. void test_to_neg(std::string source,std::basic_string<Char> target,std::string encoding)
  174. {
  175. using namespace boost::locale::conv;
  176. boost::locale::generator g;
  177. std::locale l=g("en_US."+encoding);
  178. TEST(to_utf<Char>(source,encoding)==target);
  179. TEST(to_utf<Char>(source.c_str(),encoding)==target);
  180. TEST(to_utf<Char>(source.c_str(),source.c_str()+source.size(),encoding)==target);
  181. TEST(to_utf<Char>(source,l)==target);
  182. TEST(to_utf<Char>(source.c_str(),l)==target);
  183. TEST(to_utf<Char>(source.c_str(),source.c_str()+source.size(),l)==target);
  184. TESTF(to_utf<Char>(source,encoding,stop));
  185. TESTF(to_utf<Char>(source.c_str(),encoding,stop));
  186. TESTF(to_utf<Char>(source.c_str(),source.c_str()+source.size(),encoding,stop));
  187. TESTF(to_utf<Char>(source,l,stop));
  188. TESTF(to_utf<Char>(source.c_str(),l,stop));
  189. TESTF(to_utf<Char>(source.c_str(),source.c_str()+source.size(),l,stop));
  190. }
  191. template<typename Char>
  192. void test_from_neg(std::basic_string<Char> source,std::string target,std::string encoding)
  193. {
  194. using namespace boost::locale::conv;
  195. boost::locale::generator g;
  196. std::locale l=g("en_US."+encoding);
  197. TEST(from_utf<Char>(source,encoding)==target);
  198. TEST(from_utf<Char>(source.c_str(),encoding)==target);
  199. TEST(from_utf<Char>(source.c_str(),source.c_str()+source.size(),encoding)==target);
  200. TEST(from_utf<Char>(source,l)==target);
  201. TEST(from_utf<Char>(source.c_str(),l)==target);
  202. TEST(from_utf<Char>(source.c_str(),source.c_str()+source.size(),l)==target);
  203. TESTF(from_utf<Char>(source,encoding,stop));
  204. TESTF(from_utf<Char>(source.c_str(),encoding,stop));
  205. TESTF(from_utf<Char>(source.c_str(),source.c_str()+source.size(),encoding,stop));
  206. TESTF(from_utf<Char>(source,l,stop));
  207. TESTF(from_utf<Char>(source.c_str(),l,stop));
  208. TESTF(from_utf<Char>(source.c_str(),source.c_str()+source.size(),l,stop));
  209. }
  210. template<typename Char>
  211. std::basic_string<Char> utf(char const *s)
  212. {
  213. return to<Char>(s);
  214. }
  215. template<>
  216. std::basic_string<char> utf(char const *s)
  217. {
  218. return s;
  219. }
  220. template<typename Char>
  221. void test_with_0()
  222. {
  223. std::string a("abc\0\0 yz\0",3+2+3+1);
  224. TEST(boost::locale::conv::from_utf<Char>(boost::locale::conv::to_utf<Char>(a,"UTF-8"),"UTF-8") == a);
  225. TEST(boost::locale::conv::from_utf<Char>(boost::locale::conv::to_utf<Char>(a,"ISO8859-1"),"ISO8859-1") == a);
  226. }
  /// Fixture strings for the utf_to_utf tests, selected by character type
  /// and by its size in bytes. ok() is a well-formed string, bad() is the
  /// same text with invalid code units inserted.
  template<typename Char,int n=sizeof(Char)>
  struct utfutf;

  template<>
  struct utfutf<char,1> {
      static char const *ok()
      {
          static char const text[] = "grüßen";
          return text;
      }
      static char const *bad()
      {
          // split into 2 to make SunCC happy
          static char const text[] = "gr\xFF" "üßen";
          return text;
      }
  };

  template<>
  struct utfutf<wchar_t,2> {
      static wchar_t const *ok()
      {
          static wchar_t const text[] = L"\x67\x72\xfc\xdf\x65\x6e";
          return text;
      }
      static wchar_t const *bad()
      {
          static wchar_t const buf[256] = {
              0x67, 0x72,
              static_cast<wchar_t>(0xDC01), // trail surrogate with no lead before it
              0xfc,
              static_cast<wchar_t>(0xD801), // lead surrogate ...
              static_cast<wchar_t>(0xD801), // ... followed by a lead where a trail is required
              0xdf, 0x65, 0x6e
          };
          return buf;
      }
  };

  template<>
  struct utfutf<wchar_t,4> {
      static wchar_t const *ok()
      {
          static wchar_t const text[] = L"\x67\x72\xfc\xdf\x65\x6e";
          return text;
      }
      static wchar_t const *bad()
      {
          static wchar_t const buf[256] = {
              0x67, 0x72,
              static_cast<wchar_t>(0x1000000), // > 10FFFF
              0xfc, 0xdf, 0x65, 0x6e
          };
          return buf;
      }
  };
  255. template<typename CharOut,typename CharIn>
  256. void test_combinations()
  257. {
  258. using boost::locale::conv::utf_to_utf;
  259. typedef utfutf<CharOut> out;
  260. typedef utfutf<CharIn> in;
  261. TEST( (utf_to_utf<CharOut,CharIn>(in::ok())==out::ok()) );
  262. TESTF( (utf_to_utf<CharOut,CharIn>(in::bad(),boost::locale::conv::stop)) );
  263. TEST( (utf_to_utf<CharOut,CharIn>(in::bad())==out::ok()) );
  264. }
  265. void test_all_combinations()
  266. {
  267. std::cout << "Testing utf_to_utf" << std::endl;
  268. std::cout <<" char<-char"<<std::endl;
  269. test_combinations<char,char>();
  270. std::cout <<" char<-wchar"<<std::endl;
  271. test_combinations<char,wchar_t>();
  272. std::cout <<" wchar<-char"<<std::endl;
  273. test_combinations<wchar_t,char>();
  274. std::cout <<" wchar<-wchar"<<std::endl;
  275. test_combinations<wchar_t,wchar_t>();
  276. }
  277. template<typename Char>
  278. void test_to()
  279. {
  280. test_pos<Char>(to<char>("grüßen"),utf<Char>("grüßen"),"ISO8859-1");
  281. if(test_iso_8859_8)
  282. test_pos<Char>("\xf9\xec\xe5\xed",utf<Char>("שלום"),"ISO8859-8");
  283. test_pos<Char>("grüßen",utf<Char>("grüßen"),"UTF-8");
  284. test_pos<Char>("abc\"\xf0\xa0\x82\x8a\"",utf<Char>("abc\"\xf0\xa0\x82\x8a\""),"UTF-8");
  285. test_to_neg<Char>("g\xFFrüßen",utf<Char>("grüßen"),"UTF-8");
  286. test_from_neg<Char>(utf<Char>("hello שלום"),"hello ","ISO8859-1");
  287. test_with_0<Char>();
  288. }
  289. void test_skip(char const *enc,char const *utf,char const *name,char const *opt=0)
  290. {
  291. if(opt!=0) {
  292. if(boost::locale::conv::to_utf<char>(enc,name) == opt) {
  293. test_skip(enc,opt,name);
  294. return;
  295. }
  296. }
  297. TEST(boost::locale::conv::to_utf<char>(enc,name) == utf);
  298. TEST(boost::locale::conv::to_utf<wchar_t>(enc,name) == boost::locale::conv::utf_to_utf<wchar_t>(utf));
  300. TEST(boost::locale::conv::to_utf<char16_t>(enc,name) == boost::locale::conv::utf_to_utf<char16_t>(utf));
  301. #endif
  303. TEST(boost::locale::conv::to_utf<char32_t>(enc,name) == boost::locale::conv::utf_to_utf<char32_t>(utf));
  304. #endif
  305. }
  306. void test_simple_conversions()
  307. {
  308. namespace blc=boost::locale::conv;
  309. std::cout << "- Testing correct invalid bytes skipping" << std::endl;
  310. try {
  311. std::cout << "-- ISO-8859-8" << std::endl;
  312. test_skip("test \xE0\xE1\xFB-","test \xd7\x90\xd7\x91-","ISO-8859-8");
  313. test_skip("\xFB","","ISO-8859-8");
  314. test_skip("test \xE0\xE1\xFB","test \xd7\x90\xd7\x91","ISO-8859-8");
  315. test_skip("\xFB-","-","ISO-8859-8");
  316. }
  317. catch(blc::invalid_charset_error const &) {
  318. std::cout <<"--- not supported" << std::endl;
  319. }
  320. try {
  321. std::cout << "-- cp932" << std::endl;
  322. test_skip("test\xE0\xA0 \x83\xF8-","test\xe7\x87\xbf -","cp932","test\xe7\x87\xbf ");
  323. test_skip("\x83\xF8","","cp932");
  324. test_skip("test\xE0\xA0 \x83\xF8","test\xe7\x87\xbf ","cp932");
  325. test_skip("\x83\xF8-","-","cp932","");
  326. }
  327. catch(blc::invalid_charset_error const &) {
  328. std::cout <<"--- not supported" << std::endl;
  329. }
  330. }
  331. int main()
  332. {
  333. try {
  334. std::vector<std::string> def;
  336. def.push_back("icu");
  337. #endif
  339. def.push_back("std");
  340. #endif
  342. def.push_back("winapi");
  343. #endif
  345. def.push_back("posix");
  346. #endif
  347. #if !defined(BOOST_LOCALE_WITH_ICU) && !defined(BOOST_LOCALE_WITH_ICONV) && (defined(BOOST_WINDOWS) || defined(__CYGWIN__))
  348. test_iso_8859_8 = IsValidCodePage(28598)!=0;
  349. #endif
  350. test_simple_conversions();
  351. for(int type = 0; type < int(def.size()); type ++ ) {
  352. boost::locale::localization_backend_manager tmp_backend = boost::locale::localization_backend_manager::global();
  353. tmp_backend.select(def[type]);
  354. boost::locale::localization_backend_manager::global(tmp_backend);
  355. std::string bname = def[type];
  356. if(bname=="std") {
  357. en_us_8bit = get_std_name("en_US.ISO8859-1");
  358. he_il_8bit = get_std_name("he_IL.ISO8859-8");
  359. ja_jp_shiftjis = get_std_name("ja_JP.SJIS");
  360. if(!ja_jp_shiftjis.empty() && !test_std_supports_SJIS_codecvt(ja_jp_shiftjis))
  361. {
  362. std::cout << "Warning: detected unproper support of " << ja_jp_shiftjis << " locale, disableling it" << std::endl;
  363. ja_jp_shiftjis = "";
  364. }
  365. }
  366. else {
  367. en_us_8bit = "en_US.ISO8859-1";
  368. he_il_8bit = "he_IL.ISO8859-8";
  369. ja_jp_shiftjis = "ja_JP.SJIS";
  370. }
  371. std::cout << "Testing for backend " << def[type] << std::endl;
  372. test_iso = true;
  373. if(bname=="std" && (he_il_8bit.empty() || en_us_8bit.empty())) {
  374. std::cout << "no iso locales availible, passing" << std::endl;
  375. test_iso = false;
  376. }
  377. test_sjis = true;
  378. if(bname=="std" && ja_jp_shiftjis.empty()) {
  379. test_sjis = false;
  380. }
  381. if(bname=="winapi") {
  382. test_iso = false;
  383. test_sjis = false;
  384. }
  385. test_utf = true;
  387. if(bname=="posix") {
  388. {
  389. locale_t l = newlocale(LC_ALL_MASK,he_il_8bit.c_str(),0);
  390. if(!l)
  391. test_iso = false;
  392. else
  393. freelocale(l);
  394. }
  395. {
  396. locale_t l = newlocale(LC_ALL_MASK,en_us_8bit.c_str(),0);
  397. if(!l)
  398. test_iso = false;
  399. else
  400. freelocale(l);
  401. }
  402. {
  403. locale_t l = newlocale(LC_ALL_MASK,"en_US.UTF-8",0);
  404. if(!l)
  405. test_utf = false;
  406. else
  407. freelocale(l);
  408. }
  410. {
  411. locale_t l = newlocale(LC_ALL_MASK,ja_jp_shiftjis.c_str(),0);
  412. if(!l)
  413. test_sjis = false;
  414. else
  415. freelocale(l);
  416. }
  417. #else
  418. test_sjis = false;
  419. #endif
  420. }
  421. #endif
  422. if(def[type]=="std" && (get_std_name("en_US.UTF-8").empty() || get_std_name("he_IL.UTF-8").empty()))
  423. {
  424. test_utf = false;
  425. }
  426. std::cout << "Testing wide I/O" << std::endl;
  427. test_wide_io();
  428. std::cout << "Testing charset to/from UTF conversion functions" << std::endl;
  429. std::cout << " char" << std::endl;
  430. test_to<char>();
  431. std::cout << " wchar_t" << std::endl;
  432. test_to<wchar_t>();
  434. if(bname == "icu" || bname == "std") {
  435. std::cout << " char16_t" << std::endl;
  436. test_to<char16_t>();
  437. }
  438. #endif
  440. if(bname == "icu" || bname == "std") {
  441. std::cout << " char32_t" << std::endl;
  442. test_to<char32_t>();
  443. }
  444. #endif
  445. test_all_combinations();
  446. }
  447. }
  448. catch(std::exception const &e) {
  449. std::cerr << "Failed " << e.what() << std::endl;
  450. return EXIT_FAILURE;
  451. }
  452. FINALIZE();
  453. }
  454. // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4
  455. // boostinspect:noascii