index.hpp 46 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092
  1. //
  2. // Copyright (c) 2009-2011 Artyom Beilis (Tonkikh)
  3. //
  4. // Distributed under the Boost Software License, Version 1.0. (See
  5. // accompanying file LICENSE_1_0.txt or copy at
  6. // http://www.boost.org/LICENSE_1_0.txt)
  7. //
  8. #ifndef BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
  9. #define BOOST_LOCALE_BOUNDARY_INDEX_HPP_INCLUDED
  10. #include <boost/locale/config.hpp>
  11. #include <boost/locale/boundary/types.hpp>
  12. #include <boost/locale/boundary/facets.hpp>
  13. #include <boost/locale/boundary/segment.hpp>
  14. #include <boost/locale/boundary/boundary_point.hpp>
  15. #include <boost/iterator/iterator_facade.hpp>
  16. #include <boost/type_traits/is_same.hpp>
  17. #include <boost/shared_ptr.hpp>
  18. #include <boost/cstdint.hpp>
  19. #include <boost/assert.hpp>
  20. #ifdef BOOST_MSVC
  21. # pragma warning(push)
  22. # pragma warning(disable : 4275 4251 4231 4660)
  23. #endif
  24. #include <string>
  25. #include <locale>
  26. #include <vector>
  27. #include <iterator>
  28. #include <algorithm>
  29. #include <stdexcept>
  30. #include <iostream>
  31. namespace boost {
  32. namespace locale {
  33. namespace boundary {
  34. ///
  35. /// \defgroup boundary Boundary Analysis
  36. ///
  /// This module contains all operations required for %boundary analysis of text: character, word, line and sentence boundaries
  38. ///
  39. /// @{
  40. ///
  41. /// \cond INTERNAL
  42. namespace details {
  43. template<typename IteratorType,typename CategoryType = typename std::iterator_traits<IteratorType>::iterator_category>
  44. struct mapping_traits {
  45. typedef typename std::iterator_traits<IteratorType>::value_type char_type;
  46. static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l)
  47. {
  48. std::basic_string<char_type> str(b,e);
  49. return std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size());
  50. }
  51. };
  52. template<typename CharType,typename SomeIteratorType>
  53. struct linear_iterator_traits {
  54. static const bool is_linear =
  55. is_same<SomeIteratorType,CharType*>::value
  56. || is_same<SomeIteratorType,CharType const*>::value
  57. || is_same<SomeIteratorType,typename std::basic_string<CharType>::iterator>::value
  58. || is_same<SomeIteratorType,typename std::basic_string<CharType>::const_iterator>::value
  59. || is_same<SomeIteratorType,typename std::vector<CharType>::iterator>::value
  60. || is_same<SomeIteratorType,typename std::vector<CharType>::const_iterator>::value
  61. ;
  62. };
  63. template<typename IteratorType>
  64. struct mapping_traits<IteratorType,std::random_access_iterator_tag> {
  65. typedef typename std::iterator_traits<IteratorType>::value_type char_type;
  66. static index_type map(boundary_type t,IteratorType b,IteratorType e,std::locale const &l)
  67. {
  68. index_type result;
  69. //
  70. // Optimize for most common cases
  71. //
  72. // C++0x requires that string is continious in memory and all known
  73. // string implementations
  74. // do this because of c_str() support.
  75. //
  76. if(linear_iterator_traits<char_type,IteratorType>::is_linear && b!=e)
  77. {
  78. char_type const *begin = &*b;
  79. char_type const *end = begin + (e-b);
  80. index_type tmp=std::use_facet<boundary_indexing<char_type> >(l).map(t,begin,end);
  81. result.swap(tmp);
  82. }
  83. else {
  84. std::basic_string<char_type> str(b,e);
  85. index_type tmp = std::use_facet<boundary_indexing<char_type> >(l).map(t,str.c_str(),str.c_str()+str.size());
  86. result.swap(tmp);
  87. }
  88. return result;
  89. }
  90. };
  91. template<typename BaseIterator>
  92. class mapping {
  93. public:
  94. typedef BaseIterator base_iterator;
  95. typedef typename std::iterator_traits<base_iterator>::value_type char_type;
  96. mapping(boundary_type type,
  97. base_iterator begin,
  98. base_iterator end,
  99. std::locale const &loc)
  100. :
  101. index_(new index_type()),
  102. begin_(begin),
  103. end_(end)
  104. {
  105. index_type idx=details::mapping_traits<base_iterator>::map(type,begin,end,loc);
  106. index_->swap(idx);
  107. }
  108. mapping()
  109. {
  110. }
  111. index_type const &index() const
  112. {
  113. return *index_;
  114. }
  115. base_iterator begin() const
  116. {
  117. return begin_;
  118. }
  119. base_iterator end() const
  120. {
  121. return end_;
  122. }
  123. private:
  124. boost::shared_ptr<index_type> index_;
  125. base_iterator begin_,end_;
  126. };
  127. template<typename BaseIterator>
  128. class segment_index_iterator :
  129. public boost::iterator_facade<
  130. segment_index_iterator<BaseIterator>,
  131. segment<BaseIterator>,
  132. boost::bidirectional_traversal_tag,
  133. segment<BaseIterator> const &
  134. >
  135. {
  136. public:
  137. typedef BaseIterator base_iterator;
  138. typedef mapping<base_iterator> mapping_type;
  139. typedef segment<base_iterator> segment_type;
  140. segment_index_iterator() : current_(0,0),map_(0)
  141. {
  142. }
  143. segment_index_iterator(base_iterator p,mapping_type const *map,rule_type mask,bool full_select) :
  144. map_(map),
  145. mask_(mask),
  146. full_select_(full_select)
  147. {
  148. set(p);
  149. }
  150. segment_index_iterator(bool is_begin,mapping_type const *map,rule_type mask,bool full_select) :
  151. map_(map),
  152. mask_(mask),
  153. full_select_(full_select)
  154. {
  155. if(is_begin)
  156. set_begin();
  157. else
  158. set_end();
  159. }
  160. segment_type const &dereference() const
  161. {
  162. return value_;
  163. }
  164. bool equal(segment_index_iterator const &other) const
  165. {
  166. return map_ == other.map_ && current_.second == other.current_.second;
  167. }
  168. void increment()
  169. {
  170. std::pair<size_t,size_t> next = current_;
  171. if(full_select_) {
  172. next.first = next.second;
  173. while(next.second < size()) {
  174. next.second++;
  175. if(valid_offset(next.second))
  176. break;
  177. }
  178. if(next.second == size())
  179. next.first = next.second - 1;
  180. }
  181. else {
  182. while(next.second < size()) {
  183. next.first = next.second;
  184. next.second++;
  185. if(valid_offset(next.second))
  186. break;
  187. }
  188. }
  189. update_current(next);
  190. }
  191. void decrement()
  192. {
  193. std::pair<size_t,size_t> next = current_;
  194. if(full_select_) {
  195. while(next.second >1) {
  196. next.second--;
  197. if(valid_offset(next.second))
  198. break;
  199. }
  200. next.first = next.second;
  201. while(next.first >0) {
  202. next.first--;
  203. if(valid_offset(next.first))
  204. break;
  205. }
  206. }
  207. else {
  208. while(next.second >1) {
  209. next.second--;
  210. if(valid_offset(next.second))
  211. break;
  212. }
  213. next.first = next.second - 1;
  214. }
  215. update_current(next);
  216. }
  217. private:
  218. void set_end()
  219. {
  220. current_.first = size() - 1;
  221. current_.second = size();
  222. value_ = segment_type(map_->end(),map_->end(),0);
  223. }
  224. void set_begin()
  225. {
  226. current_.first = current_.second = 0;
  227. value_ = segment_type(map_->begin(),map_->begin(),0);
  228. increment();
  229. }
  230. void set(base_iterator p)
  231. {
  232. size_t dist=std::distance(map_->begin(),p);
  233. index_type::const_iterator b=map_->index().begin(),e=map_->index().end();
  234. index_type::const_iterator
  235. boundary_point=std::upper_bound(b,e,break_info(dist));
  236. while(boundary_point != e && (boundary_point->rule & mask_)==0)
  237. boundary_point++;
  238. current_.first = current_.second = boundary_point - b;
  239. if(full_select_) {
  240. while(current_.first > 0) {
  241. current_.first --;
  242. if(valid_offset(current_.first))
  243. break;
  244. }
  245. }
  246. else {
  247. if(current_.first > 0)
  248. current_.first --;
  249. }
  250. value_.first = map_->begin();
  251. std::advance(value_.first,get_offset(current_.first));
  252. value_.second = value_.first;
  253. std::advance(value_.second,get_offset(current_.second) - get_offset(current_.first));
  254. update_rule();
  255. }
  256. void update_current(std::pair<size_t,size_t> pos)
  257. {
  258. std::ptrdiff_t first_diff = get_offset(pos.first) - get_offset(current_.first);
  259. std::ptrdiff_t second_diff = get_offset(pos.second) - get_offset(current_.second);
  260. std::advance(value_.first,first_diff);
  261. std::advance(value_.second,second_diff);
  262. current_ = pos;
  263. update_rule();
  264. }
  265. void update_rule()
  266. {
  267. if(current_.second != size()) {
  268. value_.rule(index()[current_.second].rule);
  269. }
  270. }
  271. size_t get_offset(size_t ind) const
  272. {
  273. if(ind == size())
  274. return index().back().offset;
  275. return index()[ind].offset;
  276. }
  277. bool valid_offset(size_t offset) const
  278. {
  279. return offset == 0
  280. || offset == size() // make sure we not acess index[size]
  281. || (index()[offset].rule & mask_)!=0;
  282. }
  283. size_t size() const
  284. {
  285. return index().size();
  286. }
  287. index_type const &index() const
  288. {
  289. return map_->index();
  290. }
  291. segment_type value_;
  292. std::pair<size_t,size_t> current_;
  293. mapping_type const *map_;
  294. rule_type mask_;
  295. bool full_select_;
  296. };
  297. template<typename BaseIterator>
  298. class boundary_point_index_iterator :
  299. public boost::iterator_facade<
  300. boundary_point_index_iterator<BaseIterator>,
  301. boundary_point<BaseIterator>,
  302. boost::bidirectional_traversal_tag,
  303. boundary_point<BaseIterator> const &
  304. >
  305. {
  306. public:
  307. typedef BaseIterator base_iterator;
  308. typedef mapping<base_iterator> mapping_type;
  309. typedef boundary_point<base_iterator> boundary_point_type;
  310. boundary_point_index_iterator() : current_(0),map_(0)
  311. {
  312. }
  313. boundary_point_index_iterator(bool is_begin,mapping_type const *map,rule_type mask) :
  314. map_(map),
  315. mask_(mask)
  316. {
  317. if(is_begin)
  318. set_begin();
  319. else
  320. set_end();
  321. }
  322. boundary_point_index_iterator(base_iterator p,mapping_type const *map,rule_type mask) :
  323. map_(map),
  324. mask_(mask)
  325. {
  326. set(p);
  327. }
  328. boundary_point_type const &dereference() const
  329. {
  330. return value_;
  331. }
  332. bool equal(boundary_point_index_iterator const &other) const
  333. {
  334. return map_ == other.map_ && current_ == other.current_;
  335. }
  336. void increment()
  337. {
  338. size_t next = current_;
  339. while(next < size()) {
  340. next++;
  341. if(valid_offset(next))
  342. break;
  343. }
  344. update_current(next);
  345. }
  346. void decrement()
  347. {
  348. size_t next = current_;
  349. while(next>0) {
  350. next--;
  351. if(valid_offset(next))
  352. break;
  353. }
  354. update_current(next);
  355. }
  356. private:
  357. void set_end()
  358. {
  359. current_ = size();
  360. value_ = boundary_point_type(map_->end(),0);
  361. }
  362. void set_begin()
  363. {
  364. current_ = 0;
  365. value_ = boundary_point_type(map_->begin(),0);
  366. }
  367. void set(base_iterator p)
  368. {
  369. size_t dist = std::distance(map_->begin(),p);
  370. index_type::const_iterator b=index().begin();
  371. index_type::const_iterator e=index().end();
  372. index_type::const_iterator ptr = std::lower_bound(b,e,break_info(dist));
  373. if(ptr==index().end())
  374. current_=size()-1;
  375. else
  376. current_=ptr - index().begin();
  377. while(!valid_offset(current_))
  378. current_ ++;
  379. std::ptrdiff_t diff = get_offset(current_) - dist;
  380. std::advance(p,diff);
  381. value_.iterator(p);
  382. update_rule();
  383. }
  384. void update_current(size_t pos)
  385. {
  386. std::ptrdiff_t diff = get_offset(pos) - get_offset(current_);
  387. base_iterator i=value_.iterator();
  388. std::advance(i,diff);
  389. current_ = pos;
  390. value_.iterator(i);
  391. update_rule();
  392. }
  393. void update_rule()
  394. {
  395. if(current_ != size()) {
  396. value_.rule(index()[current_].rule);
  397. }
  398. }
  399. size_t get_offset(size_t ind) const
  400. {
  401. if(ind == size())
  402. return index().back().offset;
  403. return index()[ind].offset;
  404. }
  405. bool valid_offset(size_t offset) const
  406. {
  407. return offset == 0
  408. || offset + 1 >= size() // last and first are always valid regardless of mark
  409. || (index()[offset].rule & mask_)!=0;
  410. }
  411. size_t size() const
  412. {
  413. return index().size();
  414. }
  415. index_type const &index() const
  416. {
  417. return map_->index();
  418. }
  419. boundary_point_type value_;
  420. size_t current_;
  421. mapping_type const *map_;
  422. rule_type mask_;
  423. };
  424. } // details
  425. /// \endcond
  426. template<typename BaseIterator>
  427. class segment_index;
  428. template<typename BaseIterator>
  429. class boundary_point_index;
  430. ///
  431. /// \brief This class holds an index of segments in the text range and allows to iterate over them
  432. ///
  /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
  434. /// to the \ref segment objects.
  435. ///
  436. /// It provides two options on way of selecting segments:
  437. ///
  438. /// - \ref rule(rule_type mask) - a mask that allows to select only specific types of segments according to
  439. /// various masks %as \ref word_any.
  440. /// \n
  441. /// The default is to select any types of boundaries.
  442. /// \n
  443. /// For example: using word %boundary analysis, when the provided mask is \ref word_kana then the iterators
  444. /// would iterate only over the words containing Kana letters and \ref word_any would select all types of
  445. /// words excluding ranges that consist of white space and punctuation marks. So iterating over the text
  446. /// "to be or not to be?" with \ref word_any rule would return segments "to", "be", "or", "not", "to", "be", instead
  447. /// of default "to", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", "?".
  448. /// - \ref full_select(bool how) - a flag that defines the way a range is selected if the rule of the previous
  449. /// %boundary point does not fit the selected rule.
  450. /// \n
  451. /// For example: We want to fetch all sentences from the following text: "Hello! How\nare you?".
  452. /// \n
  453. /// This text contains three %boundary points separating it to sentences by different rules:
  454. /// - The exclamation mark "!" ends the sentence "Hello!"
  455. /// - The line feed that splits the sentence "How\nare you?" into two parts.
  456. /// - The question mark that ends the second sentence.
  457. /// \n
  /// If you would only change the \ref rule() to \ref sentence_term then the segment_index would
  /// provide two sentences "Hello!" and "are you?" %as only they actually end with the required
  /// terminator "!" or "?". But changing \ref full_select() to true, the selected segment would include
  461. /// all the text up to previous valid %boundary point and would return two expected sentences:
  462. /// "Hello!" and "How\nare you?".
  463. ///
  464. /// This class allows to find a segment according to the given iterator in range using \ref find() member
  465. /// function.
  466. ///
  467. /// \note
  468. ///
  469. /// - Changing any of the options - \ref rule() or \ref full_select() and of course re-indexing the text
  470. /// invalidates existing iterators and they can't be used any more.
  471. /// - segment_index can be created from boundary_point_index or other segment_index that was created with
  /// same \ref boundary_type. This is a very fast operation %as they share the same index
  473. /// and it does not require its regeneration.
  474. ///
  475. /// \see
  476. ///
  477. /// - \ref boundary_point_index
  478. /// - \ref segment
  479. /// - \ref boundary_point
  480. ///
  481. template<typename BaseIterator>
  482. class segment_index {
  483. public:
  484. ///
  485. /// The type of the iterator used to iterate over the original text
  486. ///
  487. typedef BaseIterator base_iterator;
  488. #ifdef BOOST_LOCALE_DOXYGEN
  489. ///
  490. /// The bidirectional iterator that iterates over \ref value_type objects.
  491. ///
  492. /// - The iterators may be invalidated by use of any non-const member function
  493. /// including but not limited to \ref rule(rule_type) and \ref full_select(bool).
  494. /// - The returned value_type object is valid %as long %as iterator points to it.
  495. /// So this following code is wrong %as t used after p was updated:
  496. /// \code
  497. /// segment_index<some_iterator>::iterator p=index.begin();
  498. /// segment<some_iterator> &t = *p;
  499. /// ++p;
  500. /// cout << t.str() << endl;
  501. /// \endcode
  502. ///
  503. typedef unspecified_iterator_type iterator;
  504. ///
  505. /// \copydoc iterator
  506. ///
  507. typedef unspecified_iterator_type const_iterator;
  508. #else
  509. typedef details::segment_index_iterator<base_iterator> iterator;
  510. typedef details::segment_index_iterator<base_iterator> const_iterator;
  511. #endif
  512. ///
  513. /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
  514. /// an object that represents selected segment.
  515. ///
  516. typedef segment<base_iterator> value_type;
  517. ///
  518. /// Default constructor.
  519. ///
  520. /// \note
  521. ///
  522. /// When this object is constructed by default it does not include a valid index, thus
  523. /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
  524. /// behavior
  525. ///
  526. segment_index() : mask_(0xFFFFFFFFu),full_select_(false)
  527. {
  528. }
  529. ///
  530. /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
  531. /// in range [begin,end) using a rule \a mask for locale \a loc.
  532. ///
  533. segment_index(boundary_type type,
  534. base_iterator begin,
  535. base_iterator end,
  536. rule_type mask,
  537. std::locale const &loc=std::locale())
  538. :
  539. map_(type,begin,end,loc),
  540. mask_(mask),
  541. full_select_(false)
  542. {
  543. }
  544. ///
  545. /// Create a segment_index for %boundary analysis \ref boundary_type "type" of the text
  546. /// in range [begin,end) selecting all possible segments (full mask) for locale \a loc.
  547. ///
  548. segment_index(boundary_type type,
  549. base_iterator begin,
  550. base_iterator end,
  551. std::locale const &loc=std::locale())
  552. :
  553. map_(type,begin,end,loc),
  554. mask_(0xFFFFFFFFu),
  555. full_select_(false)
  556. {
  557. }
  558. ///
  559. /// Create a segment_index from a \ref boundary_point_index. It copies all indexing information
  560. /// and used default rule (all possible segments)
  561. ///
  562. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  563. /// range it is much better to create one from another rather then indexing the same
  564. /// range twice.
  565. ///
  566. /// \note \ref rule() flags are not copied
  567. ///
  568. segment_index(boundary_point_index<base_iterator> const &);
  569. ///
  570. /// Copy an index from a \ref boundary_point_index. It copies all indexing information
  571. /// and uses the default rule (all possible segments)
  572. ///
  573. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  574. /// range it is much better to create one from another rather then indexing the same
  575. /// range twice.
  576. ///
  577. /// \note \ref rule() flags are not copied
  578. ///
  579. segment_index const &operator = (boundary_point_index<base_iterator> const &);
  580. ///
  581. /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
  582. /// in range [begin,end) for locale \a loc.
  583. ///
  584. /// \note \ref rule() and \ref full_select() remain unchanged.
  585. ///
  586. void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale())
  587. {
  588. map_ = mapping_type(type,begin,end,loc);
  589. }
  590. ///
  591. /// Get the \ref iterator on the beginning of the segments range.
  592. ///
  593. /// Preconditions: the segment_index should have a mapping
  594. ///
  595. /// \note
  596. ///
  597. /// The returned iterator is invalidated by access to any non-const member functions of this object
  598. ///
  599. iterator begin() const
  600. {
  601. return iterator(true,&map_,mask_,full_select_);
  602. }
  603. ///
  604. /// Get the \ref iterator on the ending of the segments range.
  605. ///
  606. /// Preconditions: the segment_index should have a mapping
  607. ///
  608. /// The returned iterator is invalidated by access to any non-const member functions of this object
  609. ///
  610. iterator end() const
  611. {
  612. return iterator(false,&map_,mask_,full_select_);
  613. }
  614. ///
  615. /// Find a first valid segment following a position \a p.
  616. ///
  617. /// If \a p is inside a valid segment this segment is selected:
  618. ///
  619. /// For example: For \ref word %boundary analysis with \ref word_any rule():
  620. ///
  621. /// - "to| be or ", would point to "be",
  622. /// - "t|o be or ", would point to "to",
  623. /// - "to be or| ", would point to end.
  624. ///
  625. ///
  626. /// Preconditions: the segment_index should have a mapping and \a p should be valid iterator
  627. /// to the text in the mapped range.
  628. ///
  629. /// The returned iterator is invalidated by access to any non-const member functions of this object
  630. ///
  631. iterator find(base_iterator p) const
  632. {
  633. return iterator(p,&map_,mask_,full_select_);
  634. }
  635. ///
  636. /// Get the mask of rules that are used
  637. ///
  638. rule_type rule() const
  639. {
  640. return mask_;
  641. }
  642. ///
  643. /// Set the mask of rules that are used
  644. ///
  645. void rule(rule_type v)
  646. {
  647. mask_ = v;
  648. }
  649. ///
  650. /// Get the full_select property value - should segment include in the range
  651. /// values that not belong to specific \ref rule() or not.
  652. ///
  653. /// The default value is false.
  654. ///
  655. /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
  656. /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false
  657. /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select()
  658. /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
  659. /// following part "are you?"
  660. ///
  661. bool full_select() const
  662. {
  663. return full_select_;
  664. }
  665. ///
  666. /// Set the full_select property value - should segment include in the range
  667. /// values that not belong to specific \ref rule() or not.
  668. ///
  669. /// The default value is false.
  670. ///
  671. /// For example for \ref sentence %boundary with rule \ref sentence_term the segments
  672. /// of text "Hello! How\nare you?" are "Hello!\", "are you?" when full_select() is false
  673. /// because "How\n" is selected %as sentence by a rule spits the text by line feed. If full_select()
  674. /// is true the returned segments are "Hello! ", "How\nare you?" where "How\n" is joined with the
  675. /// following part "are you?"
  676. ///
  677. void full_select(bool v)
  678. {
  679. full_select_ = v;
  680. }
  681. private:
  682. friend class boundary_point_index<base_iterator>;
  683. typedef details::mapping<base_iterator> mapping_type;
  684. mapping_type map_;
  685. rule_type mask_;
  686. bool full_select_;
  687. };
  688. ///
  689. /// \brief This class holds an index of \ref boundary_point "boundary points" and allows iterating
  690. /// over them.
  691. ///
  /// This class provides \ref begin() and \ref end() member functions that return bidirectional iterators
  693. /// to the \ref boundary_point objects.
  694. ///
  695. /// It provides an option that affects selecting %boundary points according to different rules:
  /// using \ref rule(rule_type mask) member function. It allows to set a mask that selects only specific
  697. /// types of %boundary points like \ref sentence_term.
  698. ///
  699. /// For example for a sentence %boundary analysis of a text "Hello! How\nare you?" when the default
  700. /// rule is used the %boundary points would be:
  701. ///
  702. /// - "|Hello! How\nare you?"
  703. /// - "Hello! |How\nare you?"
  704. /// - "Hello! How\n|are you?"
  705. /// - "Hello! How\nare you?|"
  706. ///
  707. /// However if \ref rule() is set to \ref sentence_term then the selected %boundary points would be:
  708. ///
  709. /// - "|Hello! How\nare you?"
  710. /// - "Hello! |How\nare you?"
  711. /// - "Hello! How\nare you?|"
  712. ///
  713. /// Such that a %boundary point defined by a line feed character would be ignored.
  714. ///
  715. /// This class allows to find a boundary_point according to the given iterator in range using \ref find() member
  716. /// function.
  717. ///
  718. /// \note
  /// - Even an empty text range [x,x) is considered to have one %boundary point x.
  720. /// - \a a and \a b points of the range [a,b) are always considered %boundary points
  721. /// regardless the rules used.
  /// - Changing the \ref rule() option or, of course, re-indexing the text
  723. /// invalidates existing iterators and they can't be used any more.
  724. /// - boundary_point_index can be created from segment_index or other boundary_point_index that was created with
  /// same \ref boundary_type. This is a very fast operation %as they share the same index
  726. /// and it does not require its regeneration.
  727. ///
  728. /// \see
  729. ///
  730. /// - \ref segment_index
  731. /// - \ref boundary_point
  732. /// - \ref segment
  733. ///
  734. template<typename BaseIterator>
  735. class boundary_point_index {
  736. public:
  737. ///
  738. /// The type of the iterator used to iterate over the original text
  739. ///
  740. typedef BaseIterator base_iterator;
  741. #ifdef BOOST_LOCALE_DOXYGEN
  742. ///
  743. /// The bidirectional iterator that iterates over \ref value_type objects.
  744. ///
  745. /// - The iterators may be invalidated by use of any non-const member function
  746. /// including but not limited to \ref rule(rule_type) member function.
  747. /// - The returned value_type object is valid %as long %as iterator points to it.
  748. /// So this following code is wrong %as t used after p was updated:
  749. /// \code
  750. /// boundary_point_index<some_iterator>::iterator p=index.begin();
  751. /// boundary_point<some_iterator> &t = *p;
  752. /// ++p;
  753. /// rule_type r = t->rule();
  754. /// \endcode
  755. ///
  756. typedef unspecified_iterator_type iterator;
  757. ///
  758. /// \copydoc iterator
  759. ///
  760. typedef unspecified_iterator_type const_iterator;
  761. #else
  762. typedef details::boundary_point_index_iterator<base_iterator> iterator;
  763. typedef details::boundary_point_index_iterator<base_iterator> const_iterator;
  764. #endif
  765. ///
  766. /// The type dereferenced by the \ref iterator and \ref const_iterator. It is
  767. /// an object that represents the selected \ref boundary_point "boundary point".
  768. ///
  769. typedef boundary_point<base_iterator> value_type;
  770. ///
  771. /// Default constructor.
  772. ///
  773. /// \note
  774. ///
  775. /// When this object is constructed by default it does not include a valid index, thus
  776. /// calling \ref begin(), \ref end() or \ref find() member functions would lead to undefined
  777. /// behavior
  778. ///
  779. boundary_point_index() : mask_(0xFFFFFFFFu)
  780. {
  781. }
  782. ///
  /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text
  784. /// in range [begin,end) using a rule \a mask for locale \a loc.
  785. ///
  786. boundary_point_index(boundary_type type,
  787. base_iterator begin,
  788. base_iterator end,
  789. rule_type mask,
  790. std::locale const &loc=std::locale())
  791. :
  792. map_(type,begin,end,loc),
  793. mask_(mask)
  794. {
  795. }
  796. ///
  /// Create a boundary_point_index for %boundary analysis \ref boundary_type "type" of the text
  798. /// in range [begin,end) selecting all possible %boundary points (full mask) for locale \a loc.
  799. ///
  800. boundary_point_index(boundary_type type,
  801. base_iterator begin,
  802. base_iterator end,
  803. std::locale const &loc=std::locale())
  804. :
  805. map_(type,begin,end,loc),
  806. mask_(0xFFFFFFFFu)
  807. {
  808. }
  809. ///
  810. /// Create a boundary_point_index from a \ref segment_index. It copies all indexing information
  811. /// and uses the default rule (all possible %boundary points)
  812. ///
  813. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  814. /// range it is much better to create one from another rather then indexing the same
  815. /// range twice.
  816. ///
  817. /// \note \ref rule() flags are not copied
  818. ///
  819. boundary_point_index(segment_index<base_iterator> const &other);
  820. ///
  821. /// Copy a boundary_point_index from a \ref segment_index. It copies all indexing information
  822. /// and keeps the current \ref rule() unchanged
  823. ///
  824. /// This operation is very cheap, so if you use boundary_point_index and segment_index on same text
  825. /// range it is much better to create one from another rather then indexing the same
  826. /// range twice.
  827. ///
  828. /// \note \ref rule() flags are not copied
  829. ///
  830. boundary_point_index const &operator=(segment_index<base_iterator> const &other);
  831. ///
  832. /// Create a new index for %boundary analysis \ref boundary_type "type" of the text
  833. /// in range [begin,end) for locale \a loc.
  834. ///
  835. /// \note \ref rule() remains unchanged.
  836. ///
  837. void map(boundary_type type,base_iterator begin,base_iterator end,std::locale const &loc=std::locale())
  838. {
  839. map_ = mapping_type(type,begin,end,loc);
  840. }
  841. ///
  842. /// Get the \ref iterator on the beginning of the %boundary points range.
  843. ///
  844. /// Preconditions: this boundary_point_index should have a mapping
  845. ///
  846. /// \note
  847. ///
  848. /// The returned iterator is invalidated by access to any non-const member functions of this object
  849. ///
  850. iterator begin() const
  851. {
  852. return iterator(true,&map_,mask_);
  853. }
  854. ///
  855. /// Get the \ref iterator on the ending of the %boundary points range.
  856. ///
  857. /// Preconditions: this boundary_point_index should have a mapping
  858. ///
  859. /// \note
  860. ///
  861. /// The returned iterator is invalidated by access to any non-const member functions of this object
  862. ///
  863. iterator end() const
  864. {
  865. return iterator(false,&map_,mask_);
  866. }
  867. ///
  868. /// Find a first valid %boundary point on a position \a p or following it.
  869. ///
  870. /// For example: For \ref word %boundary analysis of the text "to be or"
  871. ///
  872. /// - "|to be", would return %boundary point at "|to be",
  873. /// - "t|o be", would point to "to| be"
  874. ///
  875. /// Preconditions: the boundary_point_index should have a mapping and \a p should be valid iterator
  876. /// to the text in the mapped range.
  877. ///
  878. /// The returned iterator is invalidated by access to any non-const member functions of this object
  879. ///
  880. iterator find(base_iterator p) const
  881. {
  882. return iterator(p,&map_,mask_);
  883. }
  884. ///
  885. /// Get the mask of rules that are used
  886. ///
  887. rule_type rule() const
  888. {
  889. return mask_;
  890. }
  891. ///
  892. /// Set the mask of rules that are used
  893. ///
  894. void rule(rule_type v)
  895. {
  896. mask_ = v;
  897. }
  898. private:
  899. friend class segment_index<base_iterator>;
  900. typedef details::mapping<base_iterator> mapping_type;
  901. mapping_type map_;
  902. rule_type mask_;
  903. };
  904. /// \cond INTERNAL
  905. template<typename BaseIterator>
  906. segment_index<BaseIterator>::segment_index(boundary_point_index<BaseIterator> const &other) :
  907. map_(other.map_),
  908. mask_(0xFFFFFFFFu),
  909. full_select_(false)
  910. {
  911. }
  912. template<typename BaseIterator>
  913. boundary_point_index<BaseIterator>::boundary_point_index(segment_index<BaseIterator> const &other) :
  914. map_(other.map_),
  915. mask_(0xFFFFFFFFu)
  916. {
  917. }
  918. template<typename BaseIterator>
  919. segment_index<BaseIterator> const &segment_index<BaseIterator>::operator=(boundary_point_index<BaseIterator> const &other)
  920. {
  921. map_ = other.map_;
  922. return *this;
  923. }
  924. template<typename BaseIterator>
  925. boundary_point_index<BaseIterator> const &boundary_point_index<BaseIterator>::operator=(segment_index<BaseIterator> const &other)
  926. {
  927. map_ = other.map_;
  928. return *this;
  929. }
  930. /// \endcond
  931. typedef segment_index<std::string::const_iterator> ssegment_index; ///< convenience typedef
  932. typedef segment_index<std::wstring::const_iterator> wssegment_index; ///< convenience typedef
  933. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  934. typedef segment_index<std::u16string::const_iterator> u16ssegment_index;///< convenience typedef
  935. #endif
  936. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  937. typedef segment_index<std::u32string::const_iterator> u32ssegment_index;///< convenience typedef
  938. #endif
  939. typedef segment_index<char const *> csegment_index; ///< convenience typedef
  940. typedef segment_index<wchar_t const *> wcsegment_index; ///< convenience typedef
  941. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  942. typedef segment_index<char16_t const *> u16csegment_index; ///< convenience typedef
  943. #endif
  944. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  945. typedef segment_index<char32_t const *> u32csegment_index; ///< convenience typedef
  946. #endif
  947. typedef boundary_point_index<std::string::const_iterator> sboundary_point_index;///< convenience typedef
  948. typedef boundary_point_index<std::wstring::const_iterator> wsboundary_point_index;///< convenience typedef
  949. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  950. typedef boundary_point_index<std::u16string::const_iterator> u16sboundary_point_index;///< convenience typedef
  951. #endif
  952. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  953. typedef boundary_point_index<std::u32string::const_iterator> u32sboundary_point_index;///< convenience typedef
  954. #endif
  955. typedef boundary_point_index<char const *> cboundary_point_index; ///< convenience typedef
  956. typedef boundary_point_index<wchar_t const *> wcboundary_point_index; ///< convenience typedef
  957. #ifdef BOOST_LOCALE_ENABLE_CHAR16_T
  958. typedef boundary_point_index<char16_t const *> u16cboundary_point_index;///< convenience typedef
  959. #endif
  960. #ifdef BOOST_LOCALE_ENABLE_CHAR32_T
  961. typedef boundary_point_index<char32_t const *> u32cboundary_point_index;///< convenience typedef
  962. #endif
  963. } // boundary
  964. } // locale
  965. } // boost
  966. ///
  967. /// \example boundary.cpp
  968. /// Example of using segment_index
  969. /// \example wboundary.cpp
  970. /// Example of using segment_index over wide strings
  971. ///
  972. #ifdef BOOST_MSVC
  973. #pragma warning(pop)
  974. #endif
  975. #endif
  976. // vim: tabstop=4 expandtab shiftwidth=4 softtabstop=4