parser.hpp 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530
  1. #ifndef BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_PARSER_HPP
  2. #define BOOST_PROPERTY_TREE_DETAIL_JSON_PARSER_PARSER_HPP
  3. #include <boost/property_tree/json_parser/error.hpp>
  4. #include <boost/ref.hpp>
  5. #include <boost/bind.hpp>
  6. #include <boost/format.hpp>
  7. #include <iterator>
  8. #include <sstream>
  9. #include <string>
  10. namespace boost { namespace property_tree {
  11. namespace json_parser { namespace detail
  12. {
  13. template <typename Encoding, typename Iterator, typename Sentinel>
  14. class source
  15. {
  16. public:
  17. typedef typename std::iterator_traits<Iterator>::value_type
  18. code_unit;
  19. typedef bool (Encoding::*encoding_predicate)(code_unit c) const;
  20. explicit source(Encoding& encoding) : encoding(encoding) {}
  21. template <typename Range>
  22. void set_input(const std::string& filename, const Range& r)
  23. {
  24. this->filename = filename;
  25. cur = r.begin();
  26. end = r.end();
  27. // Note that there is no backtracking, so if e.g. a UTF-8 file
  28. // starts with something that initially looks like a BOM but isn't,
  29. // there's trouble.
  30. // However, no valid JSON file can start with a UTF-8 EF byte.
  31. encoding.skip_introduction(cur, end);
  32. line = 1;
  33. offset = 0;
  34. }
  35. bool done() const { return cur == end; }
  36. void parse_error(const char* msg) {
  37. BOOST_PROPERTY_TREE_THROW(
  38. json_parser_error(msg, filename, line));
  39. }
  40. void next() {
  41. if (encoding.is_nl(*cur)) {
  42. ++line;
  43. offset = 0;
  44. } else {
  45. ++offset;
  46. }
  47. ++cur;
  48. }
  49. template <typename Action>
  50. bool have(encoding_predicate p, Action& a) {
  51. bool found = cur != end && (encoding.*p)(*cur);
  52. if (found) {
  53. a(*cur);
  54. next();
  55. }
  56. return found;
  57. }
  58. bool have(encoding_predicate p) {
  59. DoNothing n;
  60. return have(p, n);
  61. }
  62. template <typename Action>
  63. void expect(encoding_predicate p, const char* msg, Action& a) {
  64. if (!have(p, a)) {
  65. parse_error(msg);
  66. }
  67. }
  68. void expect(encoding_predicate p, const char* msg) {
  69. DoNothing n;
  70. expect(p, msg, n);
  71. }
  72. code_unit need_cur(const char* msg) {
  73. if (cur == end) {
  74. parse_error(msg);
  75. }
  76. return *cur;
  77. }
  78. Iterator& raw_cur() { return cur; }
  79. Sentinel raw_end() { return end; }
  80. private:
  81. struct DoNothing {
  82. void operator ()(code_unit) const {}
  83. };
  84. Encoding& encoding;
  85. Iterator cur;
  86. Sentinel end;
  87. std::string filename;
  88. int line;
  89. int offset;
  90. };
  91. template <typename Callbacks, typename Encoding, typename Iterator,
  92. typename = typename std::iterator_traits<Iterator>
  93. ::iterator_category>
  94. class number_callback_adapter
  95. {
  96. public:
  97. number_callback_adapter(Callbacks& callbacks, Encoding& encoding,
  98. Iterator& cur)
  99. : callbacks(callbacks), encoding(encoding), first(cur), cur(cur)
  100. {}
  101. void operator ()(typename Encoding::external_char) {}
  102. void finish() const {
  103. callbacks.on_number(encoding.to_internal(first, cur));
  104. }
  105. private:
  106. number_callback_adapter(const number_callback_adapter&);
  107. Callbacks& callbacks;
  108. Encoding& encoding;
  109. Iterator first;
  110. Iterator& cur;
  111. };
  112. template <typename Callbacks, typename Encoding, typename Iterator>
  113. class number_callback_adapter<Callbacks, Encoding, Iterator,
  114. std::input_iterator_tag>
  115. {
  116. public:
  117. number_callback_adapter(Callbacks& callbacks, Encoding& encoding,
  118. Iterator&)
  119. : callbacks(callbacks), encoding(encoding), first(true)
  120. {}
  121. void operator ()(typename Encoding::external_char c) {
  122. if (first) {
  123. callbacks.on_begin_number();
  124. first = false;
  125. }
  126. callbacks.on_digit(encoding.to_internal_trivial(c));
  127. }
  128. void finish() const {
  129. callbacks.on_end_number();
  130. }
  131. private:
  132. number_callback_adapter(const number_callback_adapter&);
  133. Callbacks& callbacks;
  134. Encoding& encoding;
  135. bool first;
  136. };
  137. template <typename Callbacks, typename Encoding, typename Iterator,
  138. typename = typename std::iterator_traits<Iterator>
  139. ::iterator_category>
  140. class string_callback_adapter
  141. {
  142. public:
  143. string_callback_adapter(Callbacks& callbacks, Encoding& encoding,
  144. Iterator& cur)
  145. : callbacks(callbacks), encoding(encoding), cur(cur),
  146. run_begin(cur)
  147. {}
  148. void start_run() {
  149. run_begin = cur;
  150. }
  151. void finish_run() {
  152. callbacks.on_code_units(encoding.to_internal(run_begin, cur));
  153. }
  154. template <typename Sentinel, typename EncodingErrorFn>
  155. void process_codepoint(Sentinel end, EncodingErrorFn error_fn) {
  156. encoding.skip_codepoint(cur, end, error_fn);
  157. }
  158. private:
  159. string_callback_adapter(const string_callback_adapter&);
  160. Callbacks& callbacks;
  161. Encoding& encoding;
  162. Iterator& cur;
  163. Iterator run_begin;
  164. };
  165. template <typename Callbacks, typename Encoding, typename Iterator>
  166. class string_callback_adapter<Callbacks, Encoding, Iterator,
  167. std::input_iterator_tag>
  168. {
  169. public:
  170. string_callback_adapter(Callbacks& callbacks, Encoding& encoding,
  171. Iterator& cur)
  172. : callbacks(callbacks), encoding(encoding), cur(cur)
  173. {}
  174. void start_run() {}
  175. void finish_run() {}
  176. template <typename Sentinel, typename EncodingErrorFn>
  177. void process_codepoint(Sentinel end, EncodingErrorFn error_fn) {
  178. encoding.transcode_codepoint(cur, end,
  179. boost::bind(&Callbacks::on_code_unit,
  180. boost::ref(callbacks), _1),
  181. error_fn);
  182. }
  183. private:
  184. string_callback_adapter(const string_callback_adapter&);
  185. Callbacks& callbacks;
  186. Encoding& encoding;
  187. Iterator& cur;
  188. };
  189. template <typename Callbacks, typename Encoding, typename Iterator,
  190. typename Sentinel>
  191. class parser
  192. {
  193. typedef detail::number_callback_adapter<Callbacks, Encoding, Iterator>
  194. number_adapter;
  195. typedef detail::string_callback_adapter<Callbacks, Encoding, Iterator>
  196. string_adapter;
  197. typedef detail::source<Encoding, Iterator, Sentinel> source;
  198. typedef typename source::code_unit code_unit;
  199. public:
  200. parser(Callbacks& callbacks, Encoding& encoding)
  201. : callbacks(callbacks), encoding(encoding), src(encoding)
  202. {}
  203. template <typename Range>
  204. void set_input(const std::string& filename, const Range& r) {
  205. src.set_input(filename, r);
  206. }
  207. void finish() {
  208. skip_ws();
  209. if (!src.done()) {
  210. parse_error("garbage after data");
  211. }
  212. }
  213. void parse_value() {
  214. if (parse_object()) return;
  215. if (parse_array()) return;
  216. if (parse_string()) return;
  217. if (parse_boolean()) return;
  218. if (parse_null()) return;
  219. if (parse_number()) return;
  220. parse_error("expected value");
  221. }
  222. bool parse_null() {
  223. skip_ws();
  224. if (!have(&Encoding::is_n)) {
  225. return false;
  226. }
  227. expect(&Encoding::is_u, "expected 'null'");
  228. expect(&Encoding::is_l, "expected 'null'");
  229. expect(&Encoding::is_l, "expected 'null'");
  230. callbacks.on_null();
  231. return true;
  232. }
  233. bool parse_boolean() {
  234. skip_ws();
  235. if (have(&Encoding::is_t)) {
  236. expect(&Encoding::is_r, "expected 'true'");
  237. expect(&Encoding::is_u, "expected 'true'");
  238. expect(&Encoding::is_e, "expected 'true'");
  239. callbacks.on_boolean(true);
  240. return true;
  241. }
  242. if (have(&Encoding::is_f)) {
  243. expect(&Encoding::is_a, "expected 'false'");
  244. expect(&Encoding::is_l, "expected 'false'");
  245. expect(&Encoding::is_s, "expected 'false'");
  246. expect(&Encoding::is_e, "expected 'false'");
  247. callbacks.on_boolean(false);
  248. return true;
  249. }
  250. return false;
  251. }
  252. bool parse_number() {
  253. skip_ws();
  254. number_adapter adapter(callbacks, encoding, src.raw_cur());
  255. bool started = false;
  256. if (have(&Encoding::is_minus, adapter)) {
  257. started = true;
  258. }
  259. if (!have(&Encoding::is_0, adapter) && !parse_int_part(adapter)) {
  260. if (started) {
  261. parse_error("expected digits after -");
  262. }
  263. return false;
  264. }
  265. parse_frac_part(adapter);
  266. parse_exp_part(adapter);
  267. adapter.finish();
  268. return true;
  269. }
  270. bool parse_string() {
  271. skip_ws();
  272. if (!have(&Encoding::is_quote)) {
  273. return false;
  274. }
  275. callbacks.on_begin_string();
  276. string_adapter adapter(callbacks, encoding, src.raw_cur());
  277. while (!encoding.is_quote(need_cur("unterminated string"))) {
  278. if (encoding.is_backslash(*src.raw_cur())) {
  279. adapter.finish_run();
  280. next();
  281. parse_escape();
  282. adapter.start_run();
  283. } else {
  284. adapter.process_codepoint(src.raw_end(),
  285. boost::bind(&parser::parse_error,
  286. this, "invalid code sequence"));
  287. }
  288. }
  289. adapter.finish_run();
  290. callbacks.on_end_string();
  291. next();
  292. return true;
  293. }
  294. bool parse_array() {
  295. skip_ws();
  296. if (!have(&Encoding::is_open_bracket)) {
  297. return false;
  298. }
  299. callbacks.on_begin_array();
  300. skip_ws();
  301. if (have(&Encoding::is_close_bracket)) {
  302. callbacks.on_end_array();
  303. return true;
  304. }
  305. do {
  306. parse_value();
  307. skip_ws();
  308. } while (have(&Encoding::is_comma));
  309. expect(&Encoding::is_close_bracket, "expected ']' or ','");
  310. callbacks.on_end_array();
  311. return true;
  312. }
  313. bool parse_object() {
  314. skip_ws();
  315. if (!have(&Encoding::is_open_brace)) {
  316. return false;
  317. }
  318. callbacks.on_begin_object();
  319. skip_ws();
  320. if (have(&Encoding::is_close_brace)) {
  321. callbacks.on_end_object();
  322. return true;
  323. }
  324. do {
  325. if (!parse_string()) {
  326. parse_error("expected key string");
  327. }
  328. skip_ws();
  329. expect(&Encoding::is_colon, "expected ':'");
  330. parse_value();
  331. skip_ws();
  332. } while (have(&Encoding::is_comma));
  333. expect(&Encoding::is_close_brace, "expected '}' or ','");
  334. callbacks.on_end_object();
  335. return true;
  336. }
  337. private:
  338. typedef typename source::encoding_predicate encoding_predicate;
  339. void parse_error(const char* msg) { src.parse_error(msg); }
  340. void next() { src.next(); }
  341. template <typename Action>
  342. bool have(encoding_predicate p, Action& a) { return src.have(p, a); }
  343. bool have(encoding_predicate p) { return src.have(p); }
  344. template <typename Action>
  345. void expect(encoding_predicate p, const char* msg, Action& a) {
  346. src.expect(p, msg, a);
  347. }
  348. void expect(encoding_predicate p, const char* msg) {
  349. src.expect(p, msg);
  350. }
  351. code_unit need_cur(const char* msg) { return src.need_cur(msg); }
  352. void skip_ws() {
  353. while (have(&Encoding::is_ws)) {
  354. }
  355. }
  356. bool parse_int_part(number_adapter& action) {
  357. if (!have(&Encoding::is_digit0, action)) {
  358. return false;
  359. }
  360. parse_digits(action);
  361. return true;
  362. }
  363. void parse_frac_part(number_adapter& action) {
  364. if (!have(&Encoding::is_dot, action)) {
  365. return;
  366. }
  367. expect(&Encoding::is_digit, "need at least one digit after '.'",
  368. action);
  369. parse_digits(action);
  370. }
  371. void parse_exp_part(number_adapter& action) {
  372. if (!have(&Encoding::is_eE, action)) {
  373. return;
  374. }
  375. have(&Encoding::is_plusminus, action);
  376. expect(&Encoding::is_digit, "need at least one digit in exponent",
  377. action);
  378. parse_digits(action);
  379. }
  380. void parse_digits(number_adapter& action) {
  381. while (have(&Encoding::is_digit, action)) {
  382. }
  383. }
  384. void parse_escape() {
  385. if (have(&Encoding::is_quote)) {
  386. feed(0x22);
  387. } else if (have(&Encoding::is_backslash)) {
  388. feed(0x5c);
  389. } else if (have(&Encoding::is_slash)) {
  390. feed(0x2f);
  391. } else if (have(&Encoding::is_b)) {
  392. feed(0x08); // backspace
  393. } else if (have(&Encoding::is_f)) {
  394. feed(0x0c); // formfeed
  395. } else if (have(&Encoding::is_n)) {
  396. feed(0x0a); // line feed
  397. } else if (have(&Encoding::is_r)) {
  398. feed(0x0d); // carriage return
  399. } else if (have(&Encoding::is_t)) {
  400. feed(0x09); // horizontal tab
  401. } else if (have(&Encoding::is_u)) {
  402. parse_codepoint_ref();
  403. } else {
  404. parse_error("invalid escape sequence");
  405. }
  406. }
  407. unsigned parse_hex_quad() {
  408. unsigned codepoint = 0;
  409. for (int i = 0; i < 4; ++i) {
  410. int value = encoding.decode_hexdigit(
  411. need_cur("invalid escape sequence"));
  412. if (value < 0) {
  413. parse_error("invalid escape sequence");
  414. }
  415. codepoint *= 16;
  416. codepoint += value;
  417. next();
  418. }
  419. return codepoint;
  420. }
  421. static bool is_surrogate_high(unsigned codepoint) {
  422. return (codepoint & 0xfc00) == 0xd800;
  423. }
  424. static bool is_surrogate_low(unsigned codepoint) {
  425. return (codepoint & 0xfc00) == 0xdc00;
  426. }
  427. static unsigned combine_surrogates(unsigned high, unsigned low) {
  428. return 0x010000 + (((high & 0x3ff) << 10) | (low & 0x3ff));
  429. }
  430. void parse_codepoint_ref() {
  431. unsigned codepoint = parse_hex_quad();
  432. if (is_surrogate_low(codepoint)) {
  433. parse_error("invalid codepoint, stray low surrogate");
  434. }
  435. if (is_surrogate_high(codepoint)) {
  436. expect(&Encoding::is_backslash,
  437. "invalid codepoint, stray high surrogate");
  438. expect(&Encoding::is_u,
  439. "expected codepoint reference after high surrogate");
  440. int low = parse_hex_quad();
  441. if (!is_surrogate_low(low)) {
  442. parse_error("expected low surrogate after high surrogate");
  443. }
  444. codepoint = combine_surrogates(codepoint, low);
  445. }
  446. feed(codepoint);
  447. }
  448. void feed(unsigned codepoint) {
  449. encoding.feed_codepoint(codepoint,
  450. boost::bind(&Callbacks::on_code_unit,
  451. boost::ref(callbacks), _1));
  452. }
  453. Callbacks& callbacks;
  454. Encoding& encoding;
  455. source src;
  456. };
  457. }}}}
  458. #endif