// word_count_functor.cpp
  1. // Copyright (c) 2001-2010 Hartmut Kaiser
  2. //
  3. // Distributed under the Boost Software License, Version 1.0. (See accompanying
  4. // file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
  5. // This example is the equivalent to the following flex program:
  6. /*
  7. //[wcf_flex_version
  8. %{
  9. #define ID_WORD 1000
  10. #define ID_EOL 1001
  11. #define ID_CHAR 1002
  12. int c = 0, w = 0, l = 0;
  13. %}
  14. %%
  15. [^ \t\n]+ { return ID_WORD; }
  16. \n { return ID_EOL; }
  17. . { return ID_CHAR; }
  18. %%
  19. bool count(int tok)
  20. {
  21. switch (tok) {
  22. case ID_WORD: ++w; c += yyleng; break;
  23. case ID_EOL: ++l; ++c; break;
  24. case ID_CHAR: ++c; break;
  25. default:
  26. return false;
  27. }
  28. return true;
  29. }
  30. void main()
  31. {
  32. int tok = EOF;
  33. do {
  34. tok = yylex();
  35. if (!count(tok))
  36. break;
  37. } while (EOF != tok);
  38. printf("%d %d %d\n", l, w, c);
  39. }
  40. //]
  41. */
// Its purpose is to do the word count function of the wc command in UNIX. It
// prints the number of lines, words and characters in a file.
//
// This example shows how to use the tokenize() function together with a
// simple functor, which gets executed whenever a token is matched in the
// input sequence.
  48. // #define BOOST_SPIRIT_LEXERTL_DEBUG
  49. #include <boost/config/warning_disable.hpp>
  50. //[wcf_includes
  51. #include <boost/spirit/include/lex_lexertl.hpp>
  52. #include <boost/bind.hpp>
  53. #include <boost/ref.hpp>
  54. //]
  55. #include <iostream>
  56. #include <string>
  57. #include "example.hpp"
  58. //[wcf_namespaces
  59. namespace lex = boost::spirit::lex;
  60. //]
  61. ///////////////////////////////////////////////////////////////////////////////
  62. // Token id definitions
  63. ///////////////////////////////////////////////////////////////////////////////
  64. //[wcf_token_ids
  65. enum token_ids
  66. {
  67. ID_WORD = 1000,
  68. ID_EOL,
  69. ID_CHAR
  70. };
  71. //]
  72. //[wcf_token_definition
  73. /*` The template `word_count_tokens` defines three different tokens:
  74. `ID_WORD`, `ID_EOL`, and `ID_CHAR`, representing a word (anything except
  75. a whitespace or a newline), a newline character, and any other character
  76. (`ID_WORD`, `ID_EOL`, and `ID_CHAR` are enum values representing the token
  77. ids, but could be anything else convertible to an integer as well).
  78. The direct base class of any token definition class needs to be the
  79. template `lex::lexer<>`, where the corresponding template parameter (here:
  80. `lex::lexertl::lexer<BaseIterator>`) defines which underlying lexer engine has
  81. to be used to provide the required state machine functionality. In this
  82. example we use the Lexertl based lexer engine as the underlying lexer type.
  83. */
  84. template <typename Lexer>
  85. struct word_count_tokens : lex::lexer<Lexer>
  86. {
  87. word_count_tokens()
  88. {
  89. // define tokens (the regular expression to match and the corresponding
  90. // token id) and add them to the lexer
  91. this->self.add
  92. ("[^ \t\n]+", ID_WORD) // words (anything except ' ', '\t' or '\n')
  93. ("\n", ID_EOL) // newline characters
  94. (".", ID_CHAR) // anything else is a plain character
  95. ;
  96. }
  97. };
  98. //]
  99. //[wcf_functor
  100. /*` In this example the struct 'counter' is used as a functor counting the
  101. characters, words and lines in the analyzed input sequence by identifying
  102. the matched tokens as passed from the /Spirit.Lex/ library.
  103. */
  104. struct counter
  105. {
  106. //<- this is an implementation detail specific to boost::bind and doesn't show
  107. // up in the documentation
  108. typedef bool result_type;
  109. //->
  110. // the function operator gets called for each of the matched tokens
  111. // c, l, w are references to the counters used to keep track of the numbers
  112. template <typename Token>
  113. bool operator()(Token const& t, std::size_t& c, std::size_t& w, std::size_t& l) const
  114. {
  115. switch (t.id()) {
  116. case ID_WORD: // matched a word
  117. // since we're using a default token type in this example, every
  118. // token instance contains a `iterator_range<BaseIterator>` as its token
  119. // attribute pointing to the matched character sequence in the input
  120. ++w; c += t.value().size();
  121. break;
  122. case ID_EOL: // matched a newline character
  123. ++l; ++c;
  124. break;
  125. case ID_CHAR: // matched something else
  126. ++c;
  127. break;
  128. }
  129. return true; // always continue to tokenize
  130. }
  131. };
  132. //]
  133. ///////////////////////////////////////////////////////////////////////////////
  134. //[wcf_main
  135. /*` The main function simply loads the given file into memory (as a
  136. `std::string`), instantiates an instance of the token definition template
  137. using the correct iterator type (`word_count_tokens<char const*>`),
  138. and finally calls `lex::tokenize`, passing an instance of the counter function
  139. object. The return value of `lex::tokenize()` will be `true` if the
  140. whole input sequence has been successfully tokenized, and `false` otherwise.
  141. */
  142. int main(int argc, char* argv[])
  143. {
  144. // these variables are used to count characters, words and lines
  145. std::size_t c = 0, w = 0, l = 0;
  146. // read input from the given file
  147. std::string str (read_from_file(1 == argc ? "word_count.input" : argv[1]));
  148. // create the token definition instance needed to invoke the lexical analyzer
  149. word_count_tokens<lex::lexertl::lexer<> > word_count_functor;
  150. // tokenize the given string, the bound functor gets invoked for each of
  151. // the matched tokens
  152. char const* first = str.c_str();
  153. char const* last = &first[str.size()];
  154. bool r = lex::tokenize(first, last, word_count_functor,
  155. boost::bind(counter(), _1, boost::ref(c), boost::ref(w), boost::ref(l)));
  156. // print results
  157. if (r) {
  158. std::cout << "lines: " << l << ", words: " << w
  159. << ", characters: " << c << "\n";
  160. }
  161. else {
  162. std::string rest(first, last);
  163. std::cout << "Lexical analysis failed\n" << "stopped at: \""
  164. << rest << "\"\n";
  165. }
  166. return 0;
  167. }
  168. //]