test_codecvt.cpp 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. //
  2. // Copyright (c) 2015 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See
  5. // accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. //
  8. #include <boost/locale/utf8_codecvt.hpp>
  9. #include <locale>
  10. #include <iostream>
  11. #include <iomanip>
  12. #include <string.h>
  13. #include <wchar.h>
  14. #include <memory.h>
  15. #define BOOST_LOCALE_ERROR_LIMIT -1
  16. #include "test_locale.hpp"
  17. static char const *utf8_name = "\xf0\x9d\x92\x9e-\xD0\xBF\xD1\x80\xD0\xB8\xD0\xB2\xD0\xB5\xD1\x82-\xE3\x82\x84\xE3\x81\x82.txt";
  18. static wchar_t const *wide_name = L"\U0001D49E-\u043F\u0440\u0438\u0432\u0435\u0442-\u3084\u3042.txt";
  19. char const *res(std::codecvt_base::result r)
  20. {
  21. switch(r){
  22. case std::codecvt_base::ok: return "ok";
  23. case std::codecvt_base::partial: return "partial";
  24. case std::codecvt_base::error: return "error";
  25. case std::codecvt_base::noconv: return "noconv";
  26. default:
  27. return "error";
  28. }
  29. }
  30. typedef std::codecvt<wchar_t,char,std::mbstate_t> cvt_type;
  31. void test_codecvt_in_n_m(cvt_type const &cvt,int n,int m)
  32. {
  33. wchar_t const *wptr = wide_name;
  34. int wlen = wcslen(wide_name);
  35. int u8len = strlen(utf8_name);
  36. char const *from = utf8_name;
  37. char const *end = from;
  38. char const *real_end = utf8_name + u8len;
  39. char const *from_next = from;
  40. std::mbstate_t mb=std::mbstate_t();
  41. while(from_next < real_end) {
  42. if(from == end) {
  43. end = from + n;
  44. if(end > real_end)
  45. end = real_end;
  46. }
  47. wchar_t buf[128];
  48. wchar_t *to = buf;
  49. wchar_t *to_end = to + m;
  50. wchar_t *to_next = to;
  51. std::mbstate_t mb2 = mb;
  52. std::codecvt_base::result r = cvt.in(mb,from,end,from_next,to,to_end,to_next);
  53. //std::cout << "In from_size=" << (end-from) << " from move=" << (from_next - from) << " to move= " << to_next - to << " state = " << res(r) << std::endl;
  54. int count = cvt.length(mb2,from,end,to_end - to);
  55. #ifndef BOOST_LOCALE_DO_LENGTH_MBSTATE_CONST
  56. TEST(memcmp(&mb,&mb2,sizeof(mb))==0);
  57. if(count != from_next - from) {
  58. std::cout << count << " " << from_next - from << std::endl;
  59. }
  60. TEST(count == from_next - from);
  61. #else
  62. TEST(count == to_next - to);
  63. #endif
  64. if(r == cvt_type::partial) {
  65. end+=n;
  66. if(end > real_end)
  67. end = real_end;
  68. }
  69. else
  70. TEST(r == cvt_type::ok);
  71. while(to!=to_next) {
  72. TEST(*wptr == *to);
  73. wptr++;
  74. to++;
  75. }
  76. to=to_next;
  77. from = from_next;
  78. }
  79. TEST(wptr == wide_name + wlen);
  80. TEST(from == real_end);
  81. }
  82. void test_codecvt_out_n_m(cvt_type const &cvt,int n,int m)
  83. {
  84. char const *nptr = utf8_name;
  85. int wlen = wcslen(wide_name);
  86. int u8len = strlen(utf8_name);
  87. std::mbstate_t mb=std::mbstate_t();
  88. wchar_t const *from_next = wide_name;
  89. wchar_t const *real_from_end = wide_name + wlen;
  90. char buf[256];
  91. char *to = buf;
  92. char *to_next = to;
  93. char *to_end = to + n;
  94. char *real_to_end = buf + sizeof(buf);
  95. while(from_next < real_from_end) {
  96. wchar_t const *from = from_next;
  97. wchar_t const *from_end = from + m;
  98. if(from_end > real_from_end)
  99. from_end = real_from_end;
  100. if(to_end == to) {
  101. to_end = to+n;
  102. }
  103. std::codecvt_base::result r = cvt.out(mb,from,from_end,from_next,to,to_end,to_next);
  104. //std::cout << "In from_size=" << (end-from) << " from move=" << (from_next - from) << " to move= " << to_next - to << " state = " << res(r) << std::endl;
  105. if(r == cvt_type::partial) {
  106. TEST(to_end - to_next < cvt.max_length());
  107. to_end += n;
  108. if(to_end > real_to_end)
  109. to_end = real_to_end;
  110. }
  111. else {
  112. TEST(r == cvt_type::ok);
  113. }
  114. while(to!=to_next) {
  115. TEST(*nptr == *to);
  116. nptr++;
  117. to++;
  118. }
  119. from = from_next;
  120. }
  121. TEST(nptr == utf8_name + u8len);
  122. TEST(from_next == real_from_end);
  123. TEST(cvt.unshift(mb,to,to+n,to_next)==cvt_type::ok);
  124. TEST(to_next == to);
  125. }
  126. void test_codecvt_conv()
  127. {
  128. std::cout << "Conversions " << std::endl;
  129. std::locale l(std::locale::classic(),new boost::locale::utf8_codecvt<wchar_t>());
  130. cvt_type const &cvt = std::use_facet<cvt_type>(l);
  131. TEST(cvt.max_length()==4);
  132. for(int i=1;i<=(int)strlen(utf8_name)+1;i++) {
  133. for(int j=1;j<=(int)wcslen(wide_name)+1;j++) {
  134. try {
  135. test_codecvt_in_n_m(cvt,i,j);
  136. test_codecvt_out_n_m(cvt,i,j);
  137. }
  138. catch(...) {
  139. std::cerr << "Wlen=" <<j << " Nlen=" << i << std::endl;
  140. throw;
  141. }
  142. }
  143. }
  144. }
  145. void test_codecvt_err()
  146. {
  147. std::cout << "Errors " << std::endl;
  148. std::locale l(std::locale::classic(),new boost::locale::utf8_codecvt<wchar_t>());
  149. cvt_type const &cvt = std::use_facet<cvt_type>(l);
  150. std::cout << "- UTF-8" << std::endl;
  151. {
  152. wchar_t buf[2];
  153. wchar_t *to=buf;
  154. wchar_t *to_end = buf+2;
  155. wchar_t *to_next = to;
  156. char const *err_utf="1\xFF\xFF";
  157. {
  158. std::mbstate_t mb=std::mbstate_t();
  159. char const *from=err_utf;
  160. char const *from_end = from + strlen(from);
  161. char const *from_next = from;
  162. to_next = to;
  163. TEST(cvt.in(mb,from,from_end,from_next,to,to_end,to_next)==cvt_type::error);
  164. TEST(from_next == from+1);
  165. TEST(to_next == to + 1);
  166. TEST(*to == '1');
  167. }
  168. err_utf++;
  169. {
  170. std::mbstate_t mb=std::mbstate_t();
  171. char const *from=err_utf;
  172. char const *from_end = from + strlen(from);
  173. char const *from_next = from;
  174. TEST(cvt.in(mb,from,from_end,from_next,to,to_end,to_next)==cvt_type::error);
  175. TEST(from_next == from);
  176. TEST(to_next == to);
  177. }
  178. }
  179. std::cout << "- UTF-16/32" << std::endl;
  180. {
  181. char buf[32];
  182. char *to=buf;
  183. char *to_end = buf+32;
  184. char *to_next = to;
  185. wchar_t err_buf[3] = { '1' , 0xDC9E }; // second surrogate not works both for UTF-16 and 32
  186. wchar_t const *err_utf = err_buf;
  187. {
  188. std::mbstate_t mb=std::mbstate_t();
  189. wchar_t const *from=err_utf;
  190. wchar_t const *from_end = from + wcslen(from);
  191. wchar_t const *from_next = from;
  192. TEST(cvt.out(mb,from,from_end,from_next,to,to_end,to_next)==cvt_type::error);
  193. TEST(from_next == from+1);
  194. TEST(to_next == to + 1);
  195. TEST(*to == '1');
  196. }
  197. err_utf++;
  198. {
  199. std::mbstate_t mb=std::mbstate_t();
  200. wchar_t const *from=err_utf;
  201. wchar_t const *from_end = from + wcslen(from);
  202. wchar_t const *from_next = from;
  203. to_next = to;
  204. TEST(cvt.out(mb,from,from_end,from_next,to,to_end,to_next)==cvt_type::error);
  205. TEST(from_next == from);
  206. TEST(to_next == to);
  207. }
  208. }
  209. }
  210. void test_char_char()
  211. {
  212. std::cout << "Char-char specialization"<<std::endl;
  213. std::locale l(std::locale::classic(),new boost::locale::utf8_codecvt<char>());
  214. std::codecvt<char,char,std::mbstate_t> const &cvt=std::use_facet<std::codecvt<char,char,std::mbstate_t> >(l);
  215. std::mbstate_t mb=std::mbstate_t();
  216. char const *from = "a";
  217. char const *from_end = from+1;
  218. char const *from_next = from;
  219. char buf[2];
  220. char *to = buf;
  221. char *to_end = buf+1;
  222. char *to_next = to;
  223. TEST(cvt.always_noconv()==true);
  224. TEST(cvt.in(mb,from,from_end,from_next,to,to_end,to_next)==cvt_type::noconv);
  225. TEST(from_next == from);
  226. TEST(to_next == to);
  227. TEST(cvt.out(mb,from,from_end,from_next,to,to_end,to_next)==cvt_type::noconv);
  228. TEST(from_next == from);
  229. TEST(to_next == to);
  230. TEST(cvt.encoding()==1);
  231. TEST(cvt.max_length()==1);
  232. }
  233. int main()
  234. {
  235. try {
  236. test_codecvt_conv();
  237. test_codecvt_err();
  238. test_char_char();
  239. }
  240. catch(std::exception const &e) {
  241. std::cerr << "Failed : " << e.what() << std::endl;
  242. return 1;
  243. }
  244. std::cout << "Ok" << std::endl;
  245. return 0;
  246. }
  247. ///
  248. // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4