// // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh) // // Distributed under the Boost Software License, Version 1.0. (See // accompanying file LICENSE_1_0.txt or copy at // http://www.boost.org/LICENSE_1_0.txt) // #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED #include #include #include #include #include #include #include #include #include #include #ifdef BOOST_MSVC # pragma warning(push) # pragma warning(disable : 4275 4251 4231 4660) #endif #include #include #include #include #include #include #include namespace boost { namespace locale { namespace boundary { /// /// \defgroup boundary Boundary Analysis /// /// This module contains all operations required for %boundary analysis of text: character, word, like and sentence boundaries /// /// @{ /// /// \cond INTERNAL namespace details { template::iterator_category> struct mapping_traits { typedef typename std::iterator_traits::value_type char_type; static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) { std::basic_string str(b,e); return std::use_facet >(l).map(t,str.c_str(),str.c_str()+str.size()); } }; template struct linear_iterator_traits { static const bool is_linear = is_same::value || is_same::value || is_same::iterator>::value || is_same::const_iterator>::value || is_same::iterator>::value || is_same::const_iterator>::value ; }; template struct mapping_traits { typedef typename std::iterator_traits::value_type char_type; static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l) { index_type result; // // Optimize for most common cases // // C++0x requires that string is continious in memory and all known // string implementations // do this because of c_str() support. // if(linear_iterator_traits::is_linear && b!=e) { char_type const *begin = &*b; char_type const *end = begin + (e-b); index_type tmp=std::use_facet >(l).map(t,begin,end); result.swap(tmp); } else { std::basic_string str(b,e); index_type tmp = std::use_facet >(l).map(t,str.c_str(),str.c_str()+str.size()); result.swap(tmp); } return result; } }; template class mapping { public: typedef BaseIterator base_iterator; typedef typename std::iterator_traits::value_type char_type; mapping(boundary_type type, base_iterator begin, base_iterator end, std::locale const &loc) : index_(new index_type()), begin_(begin), end_(end) { index_type idx=details::mapping_traits::map(type,begin,end,loc); index_->swap(idx); } mapping() { } index_type const &index() const { return *index_; } base_iterator begin() const { return begin_; } base_iterator end() const { return end_; } private: boost::shared_ptr index_; base_iterator begin_,end_; }; template class segment_index_iterator : public boost::iterator_facade< segment_index_iterator, segment, boost::bidirectional_traversal_tag, segment const & > { public: typedef BaseIterator base_iterator; typedef mapping mapping_type; typedef segment segment_type; segment_index_iterator() : current_(0,0),map_(0) { } segment_index_iterator(base_iterator p,mapping_type const *map,rule_type mask,bool full_select) : map_(map), mask_(mask), full_select_(full_select) { set(p); } segment_index_iterator(bool is_begin,mapping_type const *map,rule_type mask,bool full_select) : map_(map), mask_(mask), full_select_(full_select) { if(is_begin) set_begin(); else set_end(); } segment_type const &dereference() const { return value_; } bool equal(segment_index_iterator const &other) const { return map_ == other.map_ && current_.second == other.current_.second; } void increment() { std::pair next = current_; if(full_select_) { next.first = next.second; while(next.second < size()) { next.second++; if(valid_offset(next.second)) break; } if(next.second == size()) next.first = next.second - 1; } else { while(next.second < size()) { next.first = next.second; next.second++; if(valid_offset(next.second)) break; } } update_current(next); } void decrement() { std::pair next = current_; if(full_select_) { while(next.second >1) { next.second--; if(valid_offset(next.second)) break; } next.first = next.second; while(next.first >0) { next.first--; if(valid_offset(next.first)) break; } } else { while(next.second >1) { next.second--; if(valid_offset(next.second)) break; } next.first = next.second - 1; } update_current(next); } private: void set_end() { current_.first = size() - 1; current_.second = size(); value_ = segment_type(map_->end(),map_->end(),0); } void set_begin() { current_.first = current_.second = 0; value_ = segment_type(map_->begin(),map_->begin(),0); increment(); } void set(base_iterator p) { size_t dist=std::distance(map_->begin(),p); index_type::const_iterator b=map_->index().begin(),e=map_->index().end(); index_type::const_iterator boundary_point=std::upper_bound(b,e,break_info(dist)); while(boundary_point != e && (boundary_point->rule & mask_)==0) boundary_point++; current_.first = current_.second = boundary_point - b; if(full_select_) { while(current_.first > 0) { current_.first --; if(valid_offset(current_.first)) break; } } else { if(current_.first > 0) current_.first --; } value_.first = map_->begin(); std::advance(value_.first,get_offset(current_.first)); value_.second = value_.first; std::advance(value_.second,get_offset(current_.second) - get_offset(current_.first)); update_rule(); } void update_current(std::pair pos) { std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first); std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second); std::advance(value_.first,first_diff); std::advance(value_.second,second_diff); current_ = pos; update_rule(); } void update_rule() { if(current_.second != size()) { value_.rule(index()[current_.second].rule); } } size_t get_offset(size_t ind) const { if(ind == size()) return index().back().offset; return index()[ind].offset; } bool valid_offset(size_t offset) const { return offset == 0 || offset == size() // make sure we not acess index[size] || (index()[offset].rule & mask_)!=0; } size_t size() const { return index().size(); } index_type const &index() const { return map_->index(); } segment_type value_; std::pair current_; mapping_type const *map_; rule_type mask_; bool full_select_; }; template class boundary_point_index_iterator : public boost::iterator_facade< boundary_point_index_iterator, boundary_point, boost::bidirectional_traversal_tag, boundary_point const & > { public: typedef BaseIterator base_iterator; typedef mapping mapping_type; typedef boundary_point boundary_point_type; boundary_point_index_iterator() : current_(0),map_(0) { } boundary_point_index_iterator(bool is_begin,mapping_type const *map,rule_type mask) : map_(map), mask_(mask) { if(is_begin) set_begin(); else set_end(); } boundary_point_index_iterator(base_iterator p,mapping_type const *map,rule_type mask) : map_(map), mask_(mask) { set(p); } boundary_point_type const &dereference() const { return value_; } bool equal(boundary_point_index_iterator const &other) const { return map_ == other.map_ && current_ == other.current_; } void increment() { size_t next = current_; while(next < size()) { next++; if(valid_offset(next)) break; } update_current(next); } void decrement() { size_t next = current_; while(next>0) { next--; if(valid_offset(next)) break; } update_current(next); } private: void set_end() { current_ = size(); value_ = boundary_point_type(map_->end(),0); } void set_begin() { current_ = 0; value_ = boundary_point_type(map_->begin(),0); } void set(base_iterator p) { size_t dist = std::distance(map_->begin(),p); index_type::const_iterator b=index().begin(); index_type::const_iterator e=index().end(); index_type::const_iterator ptr = std::lower_bound(b,e,break_info(dist)); if(ptr==index().end()) current_=size()-1; else current_=ptr - index().begin(); while(!valid_offset(current_)) current_ ++; std::ptrdiff_t diff = get_offset(current_) - dist; std::advance(p,diff); value_.iterator(p); update_rule(); } void update_current(size_t pos) { std::ptrdiff_t diff = get_offset(pos) - get_offset(current_); base_iterator i=value_.iterator(); std::advance(i,diff); current_ = pos; value_.iterator(i); update_rule(); } void update_rule() { if(current_ != size()) { value_.rule(index()[current_].rule); } } size_t get_offset(size_t ind) const { if(ind == size()) return index().back().offset; return index()[ind].offset; } bool valid_offset(size_t offset) const { return offset == 0 || offset + 1 >= size() // last and first are always valid regardless of mark || (index()[offset].rule & mask_)!=0; } size_t size() const { return index().size(); } index_type const &index() const { return map_->index(); } boundary_point_type value_; size_t current_; mapping_type const *map_; rule_type mask_; }; } // details /// \endcond template class segment_index; template class boundary_point_index; /// /// \brief This class holds an index of segments in the text range and allows to iterate over them /// /// This class is provides \ref begin() and \ref end() member functions that return bidirectional iterators /// to the \ref segment objects. /// /// It provides two options on way of selecting segments: /// /// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to /// various masks %as \ref word_any. /// \n /// The default is to select any types of boundaries. /// \n /// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators /// would iterate only over the words containing Kana letters and \ref word_any would select all types of /// words excluding ranges that consist of white space and punctuation marks. So iterating over the text /// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be", instead /// of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?". /// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous /// %boundary point does not fit the selected rule. /// \n /// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?". /// \n /// This text contains three %boundary points separating it to sentences by different rules: /// - The exclamation mark "!" ends the sentence "Hello!" /// - The line feed that splits the sentence "How\nare you?" into two parts. /// - The question mark that ends the second sentence. /// \n /// If you would only change the \ref rule() to \ref sentence_term then the segment_index would /// provide two sentences "Hello!" and "are you?" %as only them actually terminated with required /// terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include /// all the text up to previous valid %boundary point and would return two expected sentences: /// "Hello!" and "How\nare you?". /// /// This class allows to find a segment according to the given iterator in range using \ref find() member /// function. /// /// \note /// /// - Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text /// invalidates existing iterators and they can't be used any more. /// - segment_index can be created from boundary_point_index or other segment_index that was created with /// same \ref boundary_type. This is very fast operation %as they shared same index /// and it does not require its regeneration. /// /// \see /// /// - \ref boundary_point_index /// - \ref segment /// - \ref boundary_point /// template class segment_index { public: /// /// The type of the iterator used to iterate over the original text /// typedef BaseIterator base_iterator; #ifdef BOOST_LOCALE_DOXYGEN /// /// The bidirectional iterator that iterates over \ref value_type objects. /// /// - The iterators may be invalidated by use of any non-const member function /// including but not limited to \ref rule(rule_type) and \ref full_select(bool). /// - The returned value_type object is valid %as long %as iterator points to it. /// So this following code is wrong %as t used after p was updated: /// \code /// segment_index::iterator p=index.begin(); /// segment &t = *p; /// ++p; /// cout << t.str() << endl; /// \endcode /// typedef unspecified_iterator_type iterator; /// /// \copydoc iterator /// typedef unspecified_iterator_type const_iterator; #else typedef details::segment_index_iterator iterator; typedef details::segment_index_iterator const_iterator; #endif /// /// The type dereferenced by the \ref iterator and \ref const_iterator. It is /// an object that represents selected segment. /// typedef segment value_type; /// /// Default constructor. /// /// \note /// /// When this object is constructed by default it does not include a valid index, thus /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined /// behavior /// segment_index() : mask_(0xFFFFFFFFu),full_select_(false) { } /// /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text /// in range [begin,end) using a rule \a mask for locale \a loc. /// segment_index(boundary_type type, base_iterator begin, base_iterator end, rule_type mask, std::locale const &loc=std::locale()) : map_(type,begin,end,loc), mask_(mask), full_select_(false) { } /// /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc. /// segment_index(boundary_type type, base_iterator begin, base_iterator end, std::locale const &loc=std::locale()) : map_(type,begin,end,loc), mask_(0xFFFFFFFFu), full_select_(false) { } /// /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information /// and used default rule (all possible segments) /// /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text /// range it is much better to create one from another rather then indexing the same /// range twice. /// /// \note \ref rule() flags are not copied /// segment_index(boundary_point_index const &); /// /// Copy an index from a \ref boundary_point_index. It copies all indexing information /// and uses the default rule (all possible segments) /// /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text /// range it is much better to create one from another rather then indexing the same /// range twice. /// /// \note \ref rule() flags are not copied /// segment_index const &operator = (boundary_point_index const &); /// /// Create a new index for %boundary analysis \ref boundary_type "type" of the text /// in range [begin,end) for locale \a loc. /// /// \note \ref rule() and \ref full_select() remain unchanged. /// void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) { map_ = mapping_type(type,begin,end,loc); } /// /// Get the \ref iterator on the beginning of the segments range. /// /// Preconditions: the segment_index should have a mapping /// /// \note /// /// The returned iterator is invalidated by access to any non-const member functions of this object /// iterator begin() const { return iterator(true,&map_,mask_,full_select_); } /// /// Get the \ref iterator on the ending of the segments range. /// /// Preconditions: the segment_index should have a mapping /// /// The returned iterator is invalidated by access to any non-const member functions of this object /// iterator end() const { return iterator(false,&map_,mask_,full_select_); } /// /// Find a first valid segment following a position \a p. /// /// If \a p is inside a valid segment this segment is selected: /// /// For example: For \ref word %boundary analysis with \ref word_any rule(): /// /// - "to| be or ", would point to "be", /// - "t|o be or ", would point to "to", /// - "to be or| ", would point to end. /// /// /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator /// to the text in the mapped range. /// /// The returned iterator is invalidated by access to any non-const member functions of this object /// iterator find(base_iterator p) const { return iterator(p,&map_,mask_,full_select_); } /// /// Get the mask of rules that are used /// rule_type rule() const { return mask_; } /// /// Set the mask of rules that are used /// void rule(rule_type v) { mask_ = v; } /// /// Get the full_select property value - should segment include in the range /// values that not belong to specific \ref rule() or not. /// /// The default value is false. /// /// For example for \ref sentence %boundary with rule \ref sentence_term the segments /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the /// following part "are you?" /// bool full_select() const { return full_select_; } /// /// Set the full_select property value - should segment include in the range /// values that not belong to specific \ref rule() or not. /// /// The default value is false. /// /// For example for \ref sentence %boundary with rule \ref sentence_term the segments /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select() /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the /// following part "are you?" /// void full_select(bool v) { full_select_ = v; } private: friend class boundary_point_index; typedef details::mapping mapping_type; mapping_type map_; rule_type mask_; bool full_select_; }; /// /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating /// over them. /// /// This class is provides \ref begin() and \ref end() member functions that return bidirectional iterators /// to the \ref boundary_point objects. /// /// It provides an option that affects selecting %boundary points according to different rules: /// using \ref rule(rule_type mask) member function. It allows to set a mask that select only specific /// types of %boundary points like \ref sentence_term. /// /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default /// rule is used the %boundary points would be: /// /// - "|Hello! How\nare you?" /// - "Hello! |How\nare you?" /// - "Hello! How\n|are you?" /// - "Hello! How\nare you?|" /// /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be: /// /// - "|Hello! How\nare you?" /// - "Hello! |How\nare you?" /// - "Hello! How\nare you?|" /// /// Such that a %boundary point defined by a line feed character would be ignored. /// /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member /// function. /// /// \note /// - Even an empty text range [x,x) considered to have a one %boundary point x. /// - \a a and \a b points of the range [a,b) are always considered %boundary points /// regardless the rules used. /// - Changing any of the option \ref rule() or course re-indexing the text /// invalidates existing iterators and they can't be used any more. /// - boundary_point_index can be created from segment_index or other boundary_point_index that was created with /// same \ref boundary_type. This is very fast operation %as they shared same index /// and it does not require its regeneration. /// /// \see /// /// - \ref segment_index /// - \ref boundary_point /// - \ref segment /// template class boundary_point_index { public: /// /// The type of the iterator used to iterate over the original text /// typedef BaseIterator base_iterator; #ifdef BOOST_LOCALE_DOXYGEN /// /// The bidirectional iterator that iterates over \ref value_type objects. /// /// - The iterators may be invalidated by use of any non-const member function /// including but not limited to \ref rule(rule_type) member function. /// - The returned value_type object is valid %as long %as iterator points to it. /// So this following code is wrong %as t used after p was updated: /// \code /// boundary_point_index::iterator p=index.begin(); /// boundary_point &t = *p; /// ++p; /// rule_type r = t->rule(); /// \endcode /// typedef unspecified_iterator_type iterator; /// /// \copydoc iterator /// typedef unspecified_iterator_type const_iterator; #else typedef details::boundary_point_index_iterator iterator; typedef details::boundary_point_index_iterator const_iterator; #endif /// /// The type dereferenced by the \ref iterator and \ref const_iterator. It is /// an object that represents the selected \ref boundary_point "boundary point". /// typedef boundary_point value_type; /// /// Default constructor. /// /// \note /// /// When this object is constructed by default it does not include a valid index, thus /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined /// behavior /// boundary_point_index() : mask_(0xFFFFFFFFu) { } /// /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text /// in range [begin,end) using a rule \a mask for locale \a loc. /// boundary_point_index(boundary_type type, base_iterator begin, base_iterator end, rule_type mask, std::locale const &loc=std::locale()) : map_(type,begin,end,loc), mask_(mask) { } /// /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc. /// boundary_point_index(boundary_type type, base_iterator begin, base_iterator end, std::locale const &loc=std::locale()) : map_(type,begin,end,loc), mask_(0xFFFFFFFFu) { } /// /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information /// and uses the default rule (all possible %boundary points) /// /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text /// range it is much better to create one from another rather then indexing the same /// range twice. /// /// \note \ref rule() flags are not copied /// boundary_point_index(segment_index const &other); /// /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information /// and keeps the current \ref rule() unchanged /// /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text /// range it is much better to create one from another rather then indexing the same /// range twice. /// /// \note \ref rule() flags are not copied /// boundary_point_index const &operator=(segment_index const &other); /// /// Create a new index for %boundary analysis \ref boundary_type "type" of the text /// in range [begin,end) for locale \a loc. /// /// \note \ref rule() remains unchanged. /// void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale()) { map_ = mapping_type(type,begin,end,loc); } /// /// Get the \ref iterator on the beginning of the %boundary points range. /// /// Preconditions: this boundary_point_index should have a mapping /// /// \note /// /// The returned iterator is invalidated by access to any non-const member functions of this object /// iterator begin() const { return iterator(true,&map_,mask_); } /// /// Get the \ref iterator on the ending of the %boundary points range. /// /// Preconditions: this boundary_point_index should have a mapping /// /// \note /// /// The returned iterator is invalidated by access to any non-const member functions of this object /// iterator end() const { return iterator(false,&map_,mask_); } /// /// Find a first valid %boundary point on a position \a p or following it. /// /// For example: For \ref word %boundary analysis of the text "to be or" /// /// - "|to be", would return %boundary point at "|to be", /// - "t|o be", would point to "to| be" /// /// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator /// to the text in the mapped range. /// /// The returned iterator is invalidated by access to any non-const member functions of this object /// iterator find(base_iterator p) const { return iterator(p,&map_,mask_); } /// /// Get the mask of rules that are used /// rule_type rule() const { return mask_; } /// /// Set the mask of rules that are used /// void rule(rule_type v) { mask_ = v; } private: friend class segment_index; typedef details::mapping mapping_type; mapping_type map_; rule_type mask_; }; /// \cond INTERNAL template segment_index::segment_index(boundary_point_index const &other) : map_(other.map_), mask_(0xFFFFFFFFu), full_select_(false) { } template boundary_point_index::boundary_point_index(segment_index const &other) : map_(other.map_), mask_(0xFFFFFFFFu) { } template segment_index const &segment_index::operator=(boundary_point_index const &other) { map_ = other.map_; return *this; } template boundary_point_index const &boundary_point_index::operator=(segment_index const &other) { map_ = other.map_; return *this; } /// \endcond typedef segment_index ssegment_index; ///< convenience typedef typedef segment_index wssegment_index; ///< convenience typedef #ifdef BOOST_LOCALE_ENABLE_CHAR16_T typedef segment_index u16ssegment_index;///< convenience typedef #endif #ifdef BOOST_LOCALE_ENABLE_CHAR32_T typedef segment_index u32ssegment_index;///< convenience typedef #endif typedef segment_index csegment_index; ///< convenience typedef typedef segment_index wcsegment_index; ///< convenience typedef #ifdef BOOST_LOCALE_ENABLE_CHAR16_T typedef segment_index u16csegment_index; ///< convenience typedef #endif #ifdef BOOST_LOCALE_ENABLE_CHAR32_T typedef segment_index u32csegment_index; ///< convenience typedef #endif typedef boundary_point_index sboundary_point_index;///< convenience typedef typedef boundary_point_index wsboundary_point_index;///< convenience typedef #ifdef BOOST_LOCALE_ENABLE_CHAR16_T typedef boundary_point_index u16sboundary_point_index;///< convenience typedef #endif #ifdef BOOST_LOCALE_ENABLE_CHAR32_T typedef boundary_point_index u32sboundary_point_index;///< convenience typedef #endif typedef boundary_point_index cboundary_point_index; ///< convenience typedef typedef boundary_point_index wcboundary_point_index; ///< convenience typedef #ifdef BOOST_LOCALE_ENABLE_CHAR16_T typedef boundary_point_index u16cboundary_point_index;///< convenience typedef #endif #ifdef BOOST_LOCALE_ENABLE_CHAR32_T typedef boundary_point_index u32cboundary_point_index;///< convenience typedef #endif } // boundary } // locale } // boost /// /// \example boundary.cpp /// Example of using segment_index /// \example wboundary.cpp /// Example of using segment_index over wide strings /// #ifdef BOOST_MSVC #pragma warning(pop) #endif #endif // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4