123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321 |
- /* Boost.Flyweight example of flyweight-based formatted text processing.
- *
- * Copyright 2006-2014 Joaquin M Lopez Munoz.
- * Distributed under the Boost Software License, Version 1.0.
- * (See accompanying file LICENSE_1_0.txt or copy at
- * http://www.boost.org/LICENSE_1_0.txt)
- *
- * See http://www.boost.org/libs/flyweight for library home page.
- */
#include <boost/flyweight.hpp>
#include <boost/functional/hash.hpp>
#include <algorithm>
#include <cctype>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <sstream>
#include <string>
#include <vector>
- #if defined(BOOST_NO_STDC_NAMESPACE)
- namespace std{using ::exit;using ::tolower;}
- #endif
- using namespace boost::flyweights;
/* An HTML tag is modelled as its name plus whatever text followed the name
 * inside the angle brackets (properties of the form
 * name1=value1 ... namen=valuen). The program never needs to look at
 * individual properties, so they are kept unparsed, as one raw string, in
 * html_tag_data::properties.
 */

struct html_tag_data
{
  std::string name;       /* tag name */
  std::string properties; /* raw, unparsed property text; may be empty */
};

/* Two tags compare equal when both their names and their raw property
 * strings match exactly.
 */

bool operator==(const html_tag_data& x,const html_tag_data& y)
{
  if(x.name!=y.name)return false;
  return x.properties==y.properties;
}
- /* See the portability section of Boost.Hash at
- * http://boost.org/doc/html/hash/portability.html
- * for an explanation of the ADL-related workarounds.
- */
- #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
- namespace boost{
- #endif
- std::size_t hash_value(const html_tag_data& x)
- {
- std::size_t res=0;
- boost::hash_combine(res,x.name);
- boost::hash_combine(res,x.properties);
- return res;
- }
- #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
- } /* namespace boost */
- #endif
- typedef flyweight<html_tag_data> html_tag;
- /* parse_tag is passed an iterator positioned at the first char of
- * the tag after the opening '<' and returns, if succesful, a parsed tag
- * and whether it is opening (<xx>) or closing (</xx>).
- */
- enum tag_type{opening,closing,failure};
- struct parse_tag_res
- {
- parse_tag_res(tag_type type_,const html_tag_data& tag_=html_tag_data()):
- type(type_),tag(tag_){}
- parse_tag_res(const parse_tag_res& x):type(x.type),tag(x.tag){}
- tag_type type;
- html_tag tag;
- };
- template<typename ForwardIterator>
- parse_tag_res parse_tag(ForwardIterator& first,ForwardIterator last)
- {
- html_tag_data tag;
- std::string buf;
- bool in_quote=false;
- for(ForwardIterator it=first;it!=last;){
- char ch=*it++;
- if(ch=='>'&&!in_quote){ /* ignore '>'s if inside quotes */
- tag_type type;
- std::string::size_type
- bname=buf.find_first_not_of("\t\n\r "),
- ename=bname==std::string::npos?
- std::string::npos:
- buf.find_first_of("\t\n\r ",bname),
- bprop=ename==std::string::npos?
- std::string::npos:
- buf.find_first_not_of("\t\n\r ",ename);
- if(bname==ename){ /* null name */
- return parse_tag_res(failure);
- }
- else if(buf[bname]=='/'){ /* closing tag */
- type=closing;
- ++bname;
- }
- else type=opening;
- tag.name=buf.substr(bname,ename-bname);
- std::transform( /* normalize tag name to lower case */
- tag.name.begin(),tag.name.end(),tag.name.begin(),
- (int(*)(int))std::tolower);
- if(bprop!=std::string::npos){
- tag.properties=buf.substr(bprop,buf.size());
- }
- first=it; /* result good, consume the chars */
- return parse_tag_res(type,tag);
- }
- else{
- if(ch=='"')in_quote=!in_quote;
- buf+=ch;
- }
- }
- return parse_tag_res(failure); /* end reached and found no '>' */
- }
- /* A character context is just a vector containing the tags enclosing the
- * character, from the outermost level to the innermost.
- */
- typedef std::vector<html_tag> html_context_data;
- typedef flyweight<html_context_data> html_context;
- /* A character is a char code plus its context.
- */
- struct character_data
- {
- character_data(char code_=0,html_context context_=html_context()):
- code(code_),context(context_){}
- character_data(const character_data& x):code(x.code),context(x.context){}
-
- char code;
- html_context context;
- };
- bool operator==(const character_data& x,const character_data& y)
- {
- return x.code==y.code&&x.context==y.context;
- }
- #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
- namespace boost{
- #endif
- std::size_t hash_value(const character_data& x)
- {
- std::size_t res=0;
- boost::hash_combine(res,x.code);
- boost::hash_combine(res,x.context);
- return res;
- }
- #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP)
- } /* namespace boost */
- #endif
- typedef flyweight<character_data> character;
- /* scan_html converts HTML code into a stream of contextualized characters.
- */
- template<typename ForwardIterator,typename OutputIterator>
- void scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
- {
- html_context_data context;
- while(first!=last){
- if(*first=='<'){ /* tag found */
- ++first;
- parse_tag_res res=parse_tag(first,last);
- if(res.type==opening){ /* add to contex */
- context.push_back(res.tag);
- continue;
- }
- else if(res.type==closing){ /* remove from context */
- /* Pop all tags from the innermost to the matching one; this takes
- * care of missing </xx>s like vg. in <ul><li>hello</ul>.
- */
- for(html_context_data::reverse_iterator rit=context.rbegin();
- rit!=context.rend();++rit){
- if(rit->get().name==res.tag.get().name){
- context.erase(rit.base()-1,context.end());
- break;
- }
- }
- continue;
- }
- }
- *out++=character(*first++,html_context(context));
- }
- }
- /* HTML-producing utilities */
- void print_opening_tag(std::ostream& os,const html_tag_data& x)
- {
- os<<"<"<<x.name;
- if(!x.properties.empty())os<<" "<<x.properties;
- os<<">";
- }
- void print_closing_tag(std::ostream& os,const html_tag_data& x)
- {
- /* SGML declarations (beginning with '!') are not closed */
- if(x.name[0]!='!')os<<"</"<<x.name<<">";
- }
- /* change_context takes contexts from and to with tags
- *
- * from<- c1 ... cn fn+1 ... fm
- * to <- c1 ... cn tn+1 ... tk
- *
- * (that is, they share the first n tags, n might be 0), and
- * produces code closing fm ... fn+1 and opening tn+1 ... tk.
- */
- template<typename OutputIterator>
- void change_context(
- const html_context_data& from,const html_context_data& to,
- OutputIterator out)
- {
- std::ostringstream oss;
- html_context_data::const_iterator
- it0=from.begin(),
- it0_end=from.end(),
- it1=to.begin(),
- it1_end=to.end();
- for(;it0!=it0_end&&it1!=it1_end&&*it0==*it1;++it0,++it1);
- while(it0_end!=it0)print_closing_tag(oss,*--it0_end);
- while(it1!=it1_end)print_opening_tag(oss,*it1++);
- std::string str=oss.str();
- std::copy(str.begin(),str.end(),out);
- }
- /* produce_html is passed a bunch of contextualized characters and emits
- * the corresponding HTML. The algorithm is simple: tags are opened and closed
- * as a result of the context from one character to the following changing.
- */
- template<typename ForwardIterator,typename OutputIterator>
- void produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out)
- {
- html_context context;
- while(first!=last){
- if(first->get().context!=context){
- change_context(context,first->get().context,out);
- context=first->get().context;
- }
- *out++=(first++)->get().code;
- }
- change_context(context,html_context(),out); /* close remaining context */
- }
- /* Without these explicit instantiations, MSVC++ 6.5/7.0 does not
- * find some friend operators in certain contexts.
- */
- character dummy1;
- html_tag dummy2;
- int main()
- {
- std::cout<<"input html file: ";
- std::string in;
- std::getline(std::cin,in);
- std::ifstream ifs(in.c_str());
- if(!ifs){
- std::cout<<"can't open "<<in<<std::endl;
- std::exit(EXIT_FAILURE);
- }
- typedef std::istreambuf_iterator<char> istrbuf_iterator;
- std::vector<char> html_source;
- std::copy(
- istrbuf_iterator(ifs),istrbuf_iterator(),
- std::back_inserter(html_source));
- /* parse the HTML */
-
- std::vector<character> scanned_html;
- scan_html(
- html_source.begin(),html_source.end(),std::back_inserter(scanned_html));
- /* Now that we have the text as a vector of contextualized characters,
- * we can shuffle it around and manipulate in almost any way we please.
- * For instance, the following reverses the central portion of the doc.
- */
- std::reverse(
- scanned_html.begin()+scanned_html.size()/4,
- scanned_html.begin()+3*(scanned_html.size()/4));
- /* emit the resulting HTML */
- std::cout<<"output html file: ";
- std::string out;
- std::getline(std::cin,out);
- std::ofstream ofs(out.c_str());
- if(!ofs){
- std::cout<<"can't open "<<out<<std::endl;
- std::exit(EXIT_FAILURE);
- }
- typedef std::ostreambuf_iterator<char> ostrbuf_iterator;
- produce_html(scanned_html.begin(),scanned_html.end(),ostrbuf_iterator(ofs));
- return 0;
- }
|