utf8_checker.ipp 9.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331
  1. //
  2. // Copyright (c) 2016-2019 Vinnie Falco (vinnie dot falco at gmail dot com)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  5. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  6. //
  7. // Official repository: https://github.com/boostorg/beast
  8. //
  9. #ifndef BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP
  10. #define BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP
#include <boost/beast/websocket/detail/utf8_checker.hpp>
#include <boost/assert.hpp>
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
  13. namespace boost {
  14. namespace beast {
  15. namespace websocket {
  16. namespace detail {
  17. void
  18. utf8_checker::
  19. reset()
  20. {
  21. need_ = 0;
  22. p_ = cp_;
  23. }
  24. bool
  25. utf8_checker::
  26. finish()
  27. {
  28. auto const success = need_ == 0;
  29. reset();
  30. return success;
  31. }
  32. bool
  33. utf8_checker::
  34. write(std::uint8_t const* in, std::size_t size)
  35. {
  36. auto const valid =
  37. [](std::uint8_t const*& p)
  38. {
  39. if(p[0] < 128)
  40. {
  41. ++p;
  42. return true;
  43. }
  44. if((p[0] & 0xe0) == 0xc0)
  45. {
  46. if( (p[1] & 0xc0) != 0x80 ||
  47. (p[0] & 0x1e) == 0) // overlong
  48. return false;
  49. p += 2;
  50. return true;
  51. }
  52. if((p[0] & 0xf0) == 0xe0)
  53. {
  54. if( (p[1] & 0xc0) != 0x80
  55. || (p[2] & 0xc0) != 0x80
  56. || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
  57. || (p[0] == 0xed && (p[1] & 0x20) == 0x20) // surrogate
  58. //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
  59. )
  60. return false;
  61. p += 3;
  62. return true;
  63. }
  64. if((p[0] & 0xf8) == 0xf0)
  65. {
  66. if( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
  67. || (p[1] & 0xc0) != 0x80
  68. || (p[2] & 0xc0) != 0x80
  69. || (p[3] & 0xc0) != 0x80
  70. || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
  71. || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4 // > U+10FFFF
  72. )
  73. return false;
  74. p += 4;
  75. return true;
  76. }
  77. return false;
  78. };
  79. auto const fail_fast =
  80. [&]()
  81. {
  82. if(cp_[0] < 128)
  83. {
  84. return false;
  85. }
  86. const auto& p = cp_; // alias, only to keep this code similar to valid() above
  87. const auto known_only = p_ - cp_;
  88. if (known_only == 1)
  89. {
  90. if((p[0] & 0xe0) == 0xc0)
  91. {
  92. return ((p[0] & 0x1e) == 0); // overlong
  93. }
  94. if((p[0] & 0xf0) == 0xe0)
  95. {
  96. return false;
  97. }
  98. if((p[0] & 0xf8) == 0xf0)
  99. {
  100. return ((p[0] & 0x07) >= 0x05); // invalid F5...FF characters
  101. }
  102. }
  103. else if (known_only == 2)
  104. {
  105. if((p[0] & 0xe0) == 0xc0)
  106. {
  107. return ((p[1] & 0xc0) != 0x80 ||
  108. (p[0] & 0x1e) == 0); // overlong
  109. }
  110. if((p[0] & 0xf0) == 0xe0)
  111. {
  112. return ( (p[1] & 0xc0) != 0x80
  113. || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
  114. || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate
  115. }
  116. if((p[0] & 0xf8) == 0xf0)
  117. {
  118. return ( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
  119. || (p[1] & 0xc0) != 0x80
  120. || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
  121. || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF
  122. }
  123. }
  124. else if (known_only == 3)
  125. {
  126. if((p[0] & 0xe0) == 0xc0)
  127. {
  128. return ( (p[1] & 0xc0) != 0x80
  129. || (p[0] & 0x1e) == 0); // overlong
  130. }
  131. if((p[0] & 0xf0) == 0xe0)
  132. {
  133. return ( (p[1] & 0xc0) != 0x80
  134. || (p[2] & 0xc0) != 0x80
  135. || (p[0] == 0xe0 && (p[1] & 0x20) == 0) // overlong
  136. || (p[0] == 0xed && (p[1] & 0x20) == 0x20)); // surrogate
  137. //|| (p[0] == 0xef && p[1] == 0xbf && (p[2] & 0xfe) == 0xbe) // U+FFFE or U+FFFF
  138. }
  139. if((p[0] & 0xf8) == 0xf0)
  140. {
  141. return ( (p[0] & 0x07) >= 0x05 // invalid F5...FF characters
  142. || (p[1] & 0xc0) != 0x80
  143. || (p[2] & 0xc0) != 0x80
  144. || (p[0] == 0xf0 && (p[1] & 0x30) == 0) // overlong
  145. || (p[0] == 0xf4 && p[1] > 0x8f) || p[0] > 0xf4); // > U+10FFFF
  146. }
  147. }
  148. return true;
  149. };
  150. auto const needed =
  151. [](std::uint8_t const v)
  152. {
  153. if(v < 128)
  154. return 1;
  155. if(v < 192)
  156. return 0;
  157. if(v < 224)
  158. return 2;
  159. if(v < 240)
  160. return 3;
  161. if(v < 248)
  162. return 4;
  163. return 0;
  164. };
  165. auto const end = in + size;
  166. // Finish up any incomplete code point
  167. if(need_ > 0)
  168. {
  169. // Calculate what we have
  170. auto n = (std::min)(size, need_);
  171. size -= n;
  172. need_ -= n;
  173. // Add characters to the code point
  174. while(n--)
  175. *p_++ = *in++;
  176. BOOST_ASSERT(p_ <= cp_ + 4);
  177. // Still incomplete?
  178. if(need_ > 0)
  179. {
  180. // Incomplete code point
  181. BOOST_ASSERT(in == end);
  182. // Do partial validation on the incomplete
  183. // code point, this is called "Fail fast"
  184. // in Autobahn|Testsuite parlance.
  185. return ! fail_fast();
  186. }
  187. // Complete code point, validate it
  188. std::uint8_t const* p = &cp_[0];
  189. if(! valid(p))
  190. return false;
  191. p_ = cp_;
  192. }
  193. if(size <= sizeof(std::size_t))
  194. goto slow;
  195. // Align `in` to sizeof(std::size_t) boundary
  196. {
  197. auto const in0 = in;
  198. auto last = reinterpret_cast<std::uint8_t const*>(
  199. ((reinterpret_cast<std::uintptr_t>(in) + sizeof(std::size_t) - 1) /
  200. sizeof(std::size_t)) * sizeof(std::size_t));
  201. // Check one character at a time for low-ASCII
  202. while(in < last)
  203. {
  204. if(*in & 0x80)
  205. {
  206. // Not low-ASCII so switch to slow loop
  207. size = size - (in - in0);
  208. goto slow;
  209. }
  210. ++in;
  211. }
  212. size = size - (in - in0);
  213. }
  214. // Fast loop: Process 4 or 8 low-ASCII characters at a time
  215. {
  216. auto const in0 = in;
  217. auto last = in + size - 7;
  218. auto constexpr mask = static_cast<
  219. std::size_t>(0x8080808080808080 & ~std::size_t{0});
  220. while(in < last)
  221. {
  222. #if 0
  223. std::size_t temp;
  224. std::memcpy(&temp, in, sizeof(temp));
  225. if((temp & mask) != 0)
  226. #else
  227. // Technically UB but works on all known platforms
  228. if((*reinterpret_cast<std::size_t const*>(in) & mask) != 0)
  229. #endif
  230. {
  231. size = size - (in - in0);
  232. goto slow;
  233. }
  234. in += sizeof(std::size_t);
  235. }
  236. // There's at least one more full code point left
  237. last += 4;
  238. while(in < last)
  239. if(! valid(in))
  240. return false;
  241. goto tail;
  242. }
  243. slow:
  244. // Slow loop: Full validation on one code point at a time
  245. {
  246. auto last = in + size - 3;
  247. while(in < last)
  248. if(! valid(in))
  249. return false;
  250. }
  251. tail:
  252. // Handle the remaining bytes. The last
  253. // characters could split a code point so
  254. // we save the partial code point for later.
  255. //
  256. // On entry to the loop, `in` points to the
  257. // beginning of a code point.
  258. //
  259. for(;;)
  260. {
  261. // Number of chars left
  262. auto n = end - in;
  263. if(! n)
  264. break;
  265. // Chars we need to finish this code point
  266. auto const need = needed(*in);
  267. if(need == 0)
  268. return false;
  269. if(need <= n)
  270. {
  271. // Check a whole code point
  272. if(! valid(in))
  273. return false;
  274. }
  275. else
  276. {
  277. // Calculate how many chars we need
  278. // to finish this partial code point
  279. need_ = need - n;
  280. // Save the partial code point
  281. while(n--)
  282. *p_++ = *in++;
  283. BOOST_ASSERT(in == end);
  284. BOOST_ASSERT(p_ <= cp_ + 4);
  285. // Do partial validation on the incomplete
  286. // code point, this is called "Fail fast"
  287. // in Autobahn|Testsuite parlance.
  288. return ! fail_fast();
  289. }
  290. }
  291. return true;
  292. }
  293. bool
  294. check_utf8(char const* p, std::size_t n)
  295. {
  296. utf8_checker c;
  297. if(! c.write(reinterpret_cast<const uint8_t*>(p), n))
  298. return false;
  299. return c.finish();
  300. }
  301. } // detail
  302. } // websocket
  303. } // beast
  304. } // boost
  305. #endif // BOOST_BEAST_WEBSOCKET_DETAIL_UTF8_CHECKER_IPP