/* Boost.Flyweight example of flyweight-based formatted text processing. * * Copyright 2006-2014 Joaquin M Lopez Munoz. * Distributed under the Boost Software License, Version 1.0. * (See accompanying file LICENSE_1_0.txt or copy at * http://www.boost.org/LICENSE_1_0.txt) * * See http://www.boost.org/libs/flyweight for library home page. */ #include #include #include #include #include #include #include #include #include #include #include #if defined(BOOST_NO_STDC_NAMESPACE) namespace std{using ::exit;using ::tolower;} #endif using namespace boost::flyweights; /* An HTML tag consists of a name and optional properties of the form * name1=value1 ... namen=valuen. We do not need to parse the properties * for the purposes of the program, hence they are all stored in * html_tag_data::properties in raw form. */ struct html_tag_data { std::string name; std::string properties; }; bool operator==(const html_tag_data& x,const html_tag_data& y) { return x.name==y.name&&x.properties==y.properties; } /* See the portability section of Boost.Hash at * http://boost.org/doc/html/hash/portability.html * for an explanation of the ADL-related workarounds. */ #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) namespace boost{ #endif std::size_t hash_value(const html_tag_data& x) { std::size_t res=0; boost::hash_combine(res,x.name); boost::hash_combine(res,x.properties); return res; } #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) } /* namespace boost */ #endif typedef flyweight html_tag; /* parse_tag is passed an iterator positioned at the first char of * the tag after the opening '<' and returns, if succesful, a parsed tag * and whether it is opening () or closing (). */ enum tag_type{opening,closing,failure}; struct parse_tag_res { parse_tag_res(tag_type type_,const html_tag_data& tag_=html_tag_data()): type(type_),tag(tag_){} parse_tag_res(const parse_tag_res& x):type(x.type),tag(x.tag){} tag_type type; html_tag tag; }; template parse_tag_res parse_tag(ForwardIterator& first,ForwardIterator last) { html_tag_data tag; std::string buf; bool in_quote=false; for(ForwardIterator it=first;it!=last;){ char ch=*it++; if(ch=='>'&&!in_quote){ /* ignore '>'s if inside quotes */ tag_type type; std::string::size_type bname=buf.find_first_not_of("\t\n\r "), ename=bname==std::string::npos? std::string::npos: buf.find_first_of("\t\n\r ",bname), bprop=ename==std::string::npos? std::string::npos: buf.find_first_not_of("\t\n\r ",ename); if(bname==ename){ /* null name */ return parse_tag_res(failure); } else if(buf[bname]=='/'){ /* closing tag */ type=closing; ++bname; } else type=opening; tag.name=buf.substr(bname,ename-bname); std::transform( /* normalize tag name to lower case */ tag.name.begin(),tag.name.end(),tag.name.begin(), (int(*)(int))std::tolower); if(bprop!=std::string::npos){ tag.properties=buf.substr(bprop,buf.size()); } first=it; /* result good, consume the chars */ return parse_tag_res(type,tag); } else{ if(ch=='"')in_quote=!in_quote; buf+=ch; } } return parse_tag_res(failure); /* end reached and found no '>' */ } /* A character context is just a vector containing the tags enclosing the * character, from the outermost level to the innermost. */ typedef std::vector html_context_data; typedef flyweight html_context; /* A character is a char code plus its context. */ struct character_data { character_data(char code_=0,html_context context_=html_context()): code(code_),context(context_){} character_data(const character_data& x):code(x.code),context(x.context){} char code; html_context context; }; bool operator==(const character_data& x,const character_data& y) { return x.code==y.code&&x.context==y.context; } #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) namespace boost{ #endif std::size_t hash_value(const character_data& x) { std::size_t res=0; boost::hash_combine(res,x.code); boost::hash_combine(res,x.context); return res; } #if defined(BOOST_NO_ARGUMENT_DEPENDENT_LOOKUP) } /* namespace boost */ #endif typedef flyweight character; /* scan_html converts HTML code into a stream of contextualized characters. */ template void scan_html(ForwardIterator first,ForwardIterator last,OutputIterator out) { html_context_data context; while(first!=last){ if(*first=='<'){ /* tag found */ ++first; parse_tag_res res=parse_tag(first,last); if(res.type==opening){ /* add to contex */ context.push_back(res.tag); continue; } else if(res.type==closing){ /* remove from context */ /* Pop all tags from the innermost to the matching one; this takes * care of missing s like vg. in

hello

. */ for(html_context_data::reverse_iterator rit=context.rbegin(); rit!=context.rend();++rit){ if(rit->get().name==res.tag.get().name){ context.erase(rit.base()-1,context.end()); break; } } continue; } } *out++=character(*first++,html_context(context)); } } /* HTML-producing utilities */ void print_opening_tag(std::ostream& os,const html_tag_data& x) { os<<"<"<"; } void print_closing_tag(std::ostream& os,const html_tag_data& x) { /* SGML declarations (beginning with '!') are not closed */ if(x.name[0]!='!')os<<""; } /* change_context takes contexts from and to with tags * * from<- c1 ... cn fn+1 ... fm * to <- c1 ... cn tn+1 ... tk * * (that is, they share the first n tags, n might be 0), and * produces code closing fm ... fn+1 and opening tn+1 ... tk. */ template void change_context( const html_context_data& from,const html_context_data& to, OutputIterator out) { std::ostringstream oss; html_context_data::const_iterator it0=from.begin(), it0_end=from.end(), it1=to.begin(), it1_end=to.end(); for(;it0!=it0_end&&it1!=it1_end&&*it0==*it1;++it0,++it1); while(it0_end!=it0)print_closing_tag(oss,*--it0_end); while(it1!=it1_end)print_opening_tag(oss,*it1++); std::string str=oss.str(); std::copy(str.begin(),str.end(),out); } /* produce_html is passed a bunch of contextualized characters and emits * the corresponding HTML. The algorithm is simple: tags are opened and closed * as a result of the context from one character to the following changing. */ template void produce_html(ForwardIterator first,ForwardIterator last,OutputIterator out) { html_context context; while(first!=last){ if(first->get().context!=context){ change_context(context,first->get().context,out); context=first->get().context; } *out++=(first++)->get().code; } change_context(context,html_context(),out); /* close remaining context */ } /* Without these explicit instantiations, MSVC++ 6.5/7.0 does not * find some friend operators in certain contexts. */ character dummy1; html_tag dummy2; int main() { std::cout<<"input html file: "; std::string in; std::getline(std::cin,in); std::ifstream ifs(in.c_str()); if(!ifs){ std::cout<<"can't open "< istrbuf_iterator; std::vector html_source; std::copy( istrbuf_iterator(ifs),istrbuf_iterator(), std::back_inserter(html_source)); /* parse the HTML */ std::vector scanned_html; scan_html( html_source.begin(),html_source.end(),std::back_inserter(scanned_html)); /* Now that we have the text as a vector of contextualized characters, * we can shuffle it around and manipulate in almost any way we please. * For instance, the following reverses the central portion of the doc. */ std::reverse( scanned_html.begin()+scanned_html.size()/4, scanned_html.begin()+3*(scanned_html.size()/4)); /* emit the resulting HTML */ std::cout<<"output html file: "; std::string out; std::getline(std::cin,out); std::ofstream ofs(out.c_str()); if(!ofs){ std::cout<<"can't open "< ostrbuf_iterator; produce_html(scanned_html.begin(),scanned_html.end(),ostrbuf_iterator(ofs)); return 0; }