html.cpp 8.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321
  1. /* Boost.Flyweight example of flyweight-based formatted text processing.
  2. *
  3. * Copyright 2006-2014 Joaquin M Lopez Munoz.
  4. * Distributed under the Boost Software License, Version 1.0.
  5. * (See accompanying file LICENSE_1_0.txt or copy at
  6. * http://www.boost.org/LICENSE_1_0.txt)
  7. *
  8. * See http://www.boost.org/libs/flyweight for library home page.
  9. */
  10. #include <boost/flyweight.hpp>
  11. #include <boost/functional/hash.hpp>
  12. #include <algorithm>
  13. #include <cctype>
  14. #include <cstdio>
  15. #include <fstream>
  16. #include <iostream>
  17. #include <iterator>
  18. #include <sstream>
  19. #include <string>
  20. #include <vector>
  21. #if defined(BOOST_NO_STDC_NAMESPACE)
  22. namespace std{using ::exit;using ::tolower;}
  23. #endif
  24. using namespace boost::flyweights;
  /* Raw description of an HTML tag: its name plus the optional properties
   * that follow it (name1=value1 ... namen=valuen). The program never needs
   * to look at individual properties, so they are kept unparsed, as a single
   * string, in html_tag_data::properties.
   */
  struct html_tag_data
  {
    std::string name;       /* tag name */
    std::string properties; /* unparsed property text, possibly empty */
  };
  35. bool operator==(const html_tag_data& x,const html_tag_data& y)
  36. {
  37. return x.name==y.name&&x.properties==y.properties;
  38. }
  39. /* See the portability section of Boost.Hash at
  40. * http://boost.org/doc/html/hash/portability.html
  41. * for an explanation of the ADL-related workarounds.
  42. */
  43. #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
  44. namespace boost{
  45. #endif
  46. std::size_t hash_value(const html_tag_data& x)
  47. {
  48. std::size_t res=0;
  49. boost::hash_combine(res,x.name);
  50. boost::hash_combine(res,x.properties);
  51. return res;
  52. }
  53. #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
  54. } /* namespace boost */
  55. #endif
  56. typedef flyweight<html_tag_data> html_tag;
  57. /* parse_tag is passed an iterator positioned at the first char of
  58. * the tag after the opening '<' and returns, if successful, a parsed tag
  59. * and whether it is opening (<xx>) or closing (</xx>).
  60. */
  /* Kind of tag reported by parse_tag, or failure when no tag was parsed. */
  enum tag_type
  {
    opening, /* <xx> */
    closing, /* </xx> */
    failure  /* parse_tag could not extract a tag */
  };
  62. struct parse_tag_res
  63. {
  64. parse_tag_res(tag_type type_,const html_tag_data& tag_=html_tag_data()):
  65. type(type_),tag(tag_){}
  66. parse_tag_res(const parse_tag_res& x):type(x.type),tag(x.tag){}
  67. tag_type type;
  68. html_tag tag;
  69. };
  70. template<typename ForwardIterator>
  71. parse_tag_res parse_tag(ForwardIterator& first,ForwardIterator last)
  72. {
  73. html_tag_data tag;
  74. std::string buf;
  75. bool in_quote=false;
  76. for(ForwardIterator it=first;it!=last;){
  77. char ch=*it++;
  78. if(ch=='>'&&!in_quote){ /* ignore '>'s if inside quotes */
  79. tag_type type;
  80. std::string::size_type
  81. bname=buf.find_first_not_of("\t\n\r "),
  82. ename=bname==std::string::npos?
  83. std::string::npos:
  84. buf.find_first_of("\t\n\r ",bname),
  85. bprop=ename==std::string::npos?
  86. std::string::npos:
  87. buf.find_first_not_of("\t\n\r ",ename);
  88. if(bname==ename){ /* null name */
  89. return parse_tag_res(failure);
  90. }
  91. else if(buf[bname]=='/'){ /* closing tag */
  92. type=closing;
  93. ++bname;
  94. }
  95. else type=opening;
  96. tag.name=buf.substr(bname,ename-bname);
  97. std::transform( /* normalize tag name to lower case */
  98. tag.name.begin(),tag.name.end(),tag.name.begin(),
  99. (int(*)(int))std::tolower);
  100. if(bprop!=std::string::npos){
  101. tag.properties=buf.substr(bprop,buf.size());
  102. }
  103. first=it; /* result good, consume the chars */
  104. return parse_tag_res(type,tag);
  105. }
  106. else{
  107. if(ch=='"')in_quote=!in_quote;
  108. buf+=ch;
  109. }
  110. }
  111. return parse_tag_res(failure); /* end reached and found no '>' */
  112. }
  113. /* A character context is just a vector containing the tags enclosing the
  114. * character, from the outermost level to the innermost.
  115. */
  116. typedef std::vector<html_tag> html_context_data;
  117. typedef flyweight<html_context_data> html_context;
  118. /* A character is a char code plus its context.
  119. */
  120. struct character_data
  121. {
  122. character_data(char code_=0,html_context context_=html_context()):
  123. code(code_),context(context_){}
  124. character_data(const character_data& x):code(x.code),context(x.context){}
  125. char code;
  126. html_context context;
  127. };
  128. bool operator==(const character_data& x,const character_data& y)
  129. {
  130. return x.code==y.code&&x.context==y.context;
  131. }
  132. #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
  133. namespace boost{
  134. #endif
  135. std::size_t hash_value(const character_data& x)
  136. {
  137. std::size_t res=0;
  138. boost::hash_combine(res,x.code);
  139. boost::hash_combine(res,x.context);
  140. return res;
  141. }
  142. #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
  143. } /* namespace boost */
  144. #endif
  145. typedef flyweight<character_data> character;
  146. /* scan_html converts HTML code into a stream of contextualized characters.
  147. */
  148. template<typename ForwardIterator,typename OutputIterator>
  149. void scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
  150. {
  151. html_context_data context;
  152. while(first!=last){
  153. if(*first=='<'){ /* tag found */
  154. ++first;
  155. parse_tag_res res=parse_tag(first,last);
  156. if(res.type==opening){ /* add to contex */
  157. context.push_back(res.tag);
  158. continue;
  159. }
  160. else if(res.type==closing){ /* remove from context */
  161. /* Pop all tags from the innermost to the matching one; this takes
  162. * care of missing </xx>s like vg. in <ul><li>hello</ul>.
  163. */
  164. for(html_context_data::reverse_iterator rit=context.rbegin();
  165. rit!=context.rend();++rit){
  166. if(rit->get().name==res.tag.get().name){
  167. context.erase(rit.base()-1,context.end());
  168. break;
  169. }
  170. }
  171. continue;
  172. }
  173. }
  174. *out++=character(*first++,html_context(context));
  175. }
  176. }
  177. /* HTML-producing utilities */
  178. void print_opening_tag(std::ostream& os,const html_tag_data& x)
  179. {
  180. os<<"<"<<x.name;
  181. if(!x.properties.empty())os<<" "<<x.properties;
  182. os<<">";
  183. }
  184. void print_closing_tag(std::ostream& os,const html_tag_data& x)
  185. {
  186. /* SGML declarations (beginning with '!') are not closed */
  187. if(x.name[0]!='!')os<<"</"<<x.name<<">";
  188. }
  189. /* change_context takes contexts from and to with tags
  190. *
  191. * from<- c1 ... cn fn+1 ... fm
  192. * to <- c1 ... cn tn+1 ... tk
  193. *
  194. * (that is, they share the first n tags, n might be 0), and
  195. * produces code closing fm ... fn+1 and opening tn+1 ... tk.
  196. */
  197. template<typename OutputIterator>
  198. void change_context(
  199. const html_context_data& from,const html_context_data& to,
  200. OutputIterator out)
  201. {
  202. std::ostringstream oss;
  203. html_context_data::const_iterator
  204. it0=from.begin(),
  205. it0_end=from.end(),
  206. it1=to.begin(),
  207. it1_end=to.end();
  208. for(;it0!=it0_end&&it1!=it1_end&&*it0==*it1;++it0,++it1);
  209. while(it0_end!=it0)print_closing_tag(oss,*--it0_end);
  210. while(it1!=it1_end)print_opening_tag(oss,*it1++);
  211. std::string str=oss.str();
  212. std::copy(str.begin(),str.end(),out);
  213. }
  214. /* produce_html is passed a bunch of contextualized characters and emits
  215. * the corresponding HTML. The algorithm is simple: tags are opened and closed
  216. * as a result of the context from one character to the following changing.
  217. */
  218. template<typename ForwardIterator,typename OutputIterator>
  219. void produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
  220. {
  221. html_context context;
  222. while(first!=last){
  223. if(first->get().context!=context){
  224. change_context(context,first->get().context,out);
  225. context=first->get().context;
  226. }
  227. *out++=(first++)->get().code;
  228. }
  229. change_context(context,html_context(),out); /* close remaining context */
  230. }
  231. /* Without these explicit instantiations, MSVC++ 6.5/7.0 does not
  232. * find some friend operators in certain contexts.
  233. */
  234. character dummy1;
  235. html_tag dummy2;
  236. int main()
  237. {
  238. std::cout<<"input html file: ";
  239. std::string in;
  240. std::getline(std::cin,in);
  241. std::ifstream ifs(in.c_str());
  242. if(!ifs){
  243. std::cout<<"can't open "<<in<<std::endl;
  244. std::exit(EXIT_FAILURE);
  245. }
  246. typedef std::istreambuf_iterator<char> istrbuf_iterator;
  247. std::vector<char> html_source;
  248. std::copy(
  249. istrbuf_iterator(ifs),istrbuf_iterator(),
  250. std::back_inserter(html_source));
  251. /* parse the HTML */
  252. std::vector<character> scanned_html;
  253. scan_html(
  254. html_source.begin(),html_source.end(),std::back_inserter(scanned_html));
  255. /* Now that we have the text as a vector of contextualized characters,
  256. * we can shuffle it around and manipulate in almost any way we please.
  257. * For instance, the following reverses the central portion of the doc.
  258. */
  259. std::reverse(
  260. scanned_html.begin()+scanned_html.size()/4,
  261. scanned_html.begin()+3*(scanned_html.size()/4));
  262. /* emit the resulting HTML */
  263. std::cout<<"output html file: ";
  264. std::string out;
  265. std::getline(std::cin,out);
  266. std::ofstream ofs(out.c_str());
  267. if(!ofs){
  268. std::cout<<"can't open "<<out<<std::endl;
  269. std::exit(EXIT_FAILURE);
  270. }
  271. typedef std::ostreambuf_iterator<char> ostrbuf_iterator;
  272. produce_html(scanned_html.begin(),scanned_html.end(),ostrbuf_iterator(ofs));
  273. return 0;
  274. }