boundary_analysys.html 34 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371
  1. <!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
  2. <html xmlns="http://www.w3.org/1999/xhtml">
  3. <head>
  4. <meta http-equiv="Content-Type" content="text/xhtml;charset=UTF-8"/>
  5. <meta http-equiv="X-UA-Compatible" content="IE=9"/>
  6. <meta name="generator" content="Doxygen 1.8.6"/>
  7. <title>Boost.Locale: Boundary analysis</title>
  8. <link href="tabs.css" rel="stylesheet" type="text/css"/>
  9. <script type="text/javascript" src="jquery.js"></script>
  10. <script type="text/javascript" src="dynsections.js"></script>
  11. <link href="navtree.css" rel="stylesheet" type="text/css"/>
  12. <script type="text/javascript" src="resize.js"></script>
  13. <script type="text/javascript" src="navtree.js"></script>
  14. <script type="text/javascript">
  15. $(document).ready(initResizable);
  16. $(window).load(resizeHeight);
  17. </script>
  18. <link href="doxygen.css" rel="stylesheet" type="text/css" />
  19. </head>
  20. <body>
  21. <div id="top"><!-- do not remove this div, it is closed by doxygen! -->
  22. <div id="titlearea">
  23. <table cellspacing="0" cellpadding="0">
  24. <tbody>
  25. <tr style="height: 56px;">
  26. <td id="projectlogo"><img alt="Logo" src="boost-small.png"/></td>
  27. <td style="padding-left: 0.5em;">
  28. <div id="projectname">Boost.Locale
  29. </div>
  30. </td>
  31. </tr>
  32. </tbody>
  33. </table>
  34. </div>
  35. <!-- end header part -->
  36. <!-- Generated by Doxygen 1.8.6 -->
  37. <div id="navrow1" class="tabs">
  38. <ul class="tablist">
  39. <li><a href="index.html"><span>Main&#160;Page</span></a></li>
  40. <li class="current"><a href="pages.html"><span>Related&#160;Pages</span></a></li>
  41. <li><a href="modules.html"><span>Modules</span></a></li>
  42. <li><a href="namespaces.html"><span>Namespaces</span></a></li>
  43. <li><a href="annotated.html"><span>Classes</span></a></li>
  44. <li><a href="files.html"><span>Files</span></a></li>
  45. <li><a href="examples.html"><span>Examples</span></a></li>
  46. </ul>
  47. </div>
  48. </div><!-- top -->
  49. <div id="side-nav" class="ui-resizable side-nav-resizable">
  50. <div id="nav-tree">
  51. <div id="nav-tree-contents">
  52. <div id="nav-sync" class="sync"></div>
  53. </div>
  54. </div>
  55. <div id="splitbar" style="-moz-user-select:none;"
  56. class="ui-resizable-handle">
  57. </div>
  58. </div>
  59. <script type="text/javascript">
  60. $(document).ready(function(){initNavTree('boundary_analysys.html','');});
  61. </script>
  62. <div id="doc-content">
  63. <div class="header">
  64. <div class="headertitle">
  65. <div class="title">Boundary analysis </div> </div>
  66. </div><!--header-->
  67. <div class="contents">
  68. <div class="textblock"><ul>
  69. <li><a class="el" href="boundary_analysys.html#boundary_analysys_basics">Basics</a></li>
  70. <li><a class="el" href="boundary_analysys.html#boundary_analysys_segments">Iterating Over Segments</a><ul>
  71. <li><a class="el" href="boundary_analysys.html#boundary_analysys_segments_basics">Basic Iteration</a></li>
  72. <li><a class="el" href="boundary_analysys.html#boundary_analysys_segments_rules">Using Rules</a></li>
  73. <li><a class="el" href="boundary_analysys.html#boundary_analysys_segments_search">Locating Segments</a></li>
  74. </ul>
  75. </li>
  76. <li><a class="el" href="boundary_analysys.html#boundary_analysys_break">Iterating Over Boundary Points</a><ul>
  77. <li><a class="el" href="boundary_analysys.html#boundary_analysys_break_basics">Basic Iteration</a></li>
  78. <li><a class="el" href="boundary_analysys.html#boundary_analysys_break_rules">Using Rules</a></li>
  79. <li><a class="el" href="boundary_analysys.html#boundary_analysys_break_search">Locating Boundary Points</a></li>
  80. </ul>
  81. </li>
  82. </ul>
  83. <h1><a class="anchor" id="boundary_analysys_basics"></a>
  84. Basics</h1>
  85. <p>Boost.Locale provides a boundary analysis tool, allowing you to split text into characters, words, or sentences, and find appropriate places for line breaks.</p>
  86. <dl class="section note"><dt>Note</dt><dd>This is not a trivial task. </dd></dl>
  87. <dl class="section user"><dt></dt><dd>A Unicode code point and a character are not equivalent, for example: Hebrew word Shalom - "שָלוֹם" that consists of 4 characters and 6 code points (4 base letters and 2 diacritical marks) </dd></dl>
  88. <dl class="section user"><dt></dt><dd>Words may not be separated by space characters in some languages like in Japanese or Chinese.</dd></dl>
  89. <p>Boost.Locale provides 2 major classes for boundary analysis:</p>
  90. <ul>
  91. <li><a class="el" href="classboost_1_1locale_1_1boundary_1_1segment__index.html">boost::locale::boundary::segment_index</a> - an object that holds an index of segments in the text (like words, characters, sentences). It provides access to <a class="el" href="classboost_1_1locale_1_1boundary_1_1segment.html">segment</a> objects via iterators.</li>
  92. <li><a class="el" href="classboost_1_1locale_1_1boundary_1_1boundary__point__index.html">boost::locale::boundary::boundary_point_index</a> - an object that holds an index of boundary points in the text. It allows iterating over the <a class="el" href="classboost_1_1locale_1_1boundary_1_1boundary__point.html">boundary_point</a> objects.</li>
  93. </ul>
  94. <p>Each of the classes above uses an iterator type as a template parameter. Both of these classes accept in their constructor:</p>
  95. <ul>
  96. <li>A flag that defines boundary analysis <a class="el" href="group__boundary.html#ga15de9963ce9bb6037c8525901dfbf641">boundary_type</a>.</li>
  97. <li>The pair of iterators that define the text range that should be analysed</li>
  98. <li>A locale parameter (if not given the global one is used)</li>
  99. </ul>
  100. <p>For example: </p>
  101. <div class="fragment"><div class="line"><span class="keyword">namespace </span>ba=boost::locale::boundary;</div>
  102. <div class="line">std::string text= ... ;</div>
  103. <div class="line">std::locale loc = ... ;</div>
  104. <div class="line">ba::segment_index&lt;std::string::const_iterator&gt; map(<a class="code" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a99aad8b8a5e25baa9f695abe5e574bb6">ba::word</a>,text.begin(),text.end(),loc);</div>
  105. </div><!-- fragment --><p>Each of them provides the members <code>begin()</code>, <code>end()</code> and <code>find()</code>, which allow iterating over the selected segments or boundaries in the text or finding the location of a segment or boundary for a given iterator.</p>
  106. <p>Convenience typedefs like <a class="el" href="group__boundary.html#gad4785439a3f03ee455c93830b8f1366c">ssegment_index</a> or <a class="el" href="group__boundary.html#gabcb5bcc788909afeb8c68d857284cb59">wcboundary_point_index</a> are provided as well, where the "w", "u16" and "u32" prefixes define the character type <code>wchar_t</code>, <code>char16_t</code> and <code>char32_t</code> respectively, and the "s" and "c" prefixes define whether <code>std::basic_string&lt;CharType&gt;::const_iterator</code> or <code>CharType const *</code> is used as the iterator type.</p>
  107. <h1><a class="anchor" id="boundary_analysys_segments"></a>
  108. Iterating Over Segments</h1>
  109. <h1><a class="anchor" id="boundary_analysys_segments_basics"></a>
  110. Basic Iteration</h1>
  111. <p>The text segments analysis is done using <a class="el" href="classboost_1_1locale_1_1boundary_1_1segment__index.html">segment_index</a> class.</p>
  112. <p>It provides a bidirectional iterator that returns a <a class="el" href="classboost_1_1locale_1_1boundary_1_1segment.html">segment</a> object. The segment object represents a pair of iterators that define this segment and a rule according to which it was selected. It can be automatically converted to a <code>std::basic_string</code> object.</p>
  113. <p>To perform boundary analysis, we first create an index object and then iterate over it:</p>
  114. <p>For example:</p>
  115. <div class="fragment"><div class="line"><span class="keyword">using namespace </span>boost::locale::boundary;</div>
  116. <div class="line"><a class="code" href="classboost_1_1locale_1_1generator.html">boost::locale::generator</a> gen;</div>
  117. <div class="line">std::string text=<span class="stringliteral">&quot;To be or not to be, that is the question.&quot;</span></div>
  118. <div class="line"><span class="comment">// Create mapping of text for token iterator using global locale.</span></div>
  119. <div class="line"><a class="code" href="classboost_1_1locale_1_1boundary_1_1segment__index.html">ssegment_index</a> map(<a class="code" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a99aad8b8a5e25baa9f695abe5e574bb6">word</a>,text.begin(),text.end(),gen(<span class="stringliteral">&quot;en_US.UTF-8&quot;</span>)); </div>
  120. <div class="line"><span class="comment">// Print all &quot;words&quot; -- chunks of word boundary</span></div>
  121. <div class="line"><span class="keywordflow">for</span>(<a class="code" href="group__boundary.html#gaf7a775e77dbbca3495e11d646df96fd2">ssegment_index::iterator</a> it=map.begin(),e=map.end();it!=e;++it)</div>
  122. <div class="line"> std::cout &lt;&lt;<span class="stringliteral">&quot;\&quot;&quot;</span>&lt;&lt; * it &lt;&lt; <span class="stringliteral">&quot;\&quot;, &quot;</span>;</div>
  123. <div class="line">std::cout &lt;&lt; std::endl;</div>
  124. </div><!-- fragment --><p>Would print:</p>
  125. <pre class="fragment">"To", " ", "be", " ", "or", " ", "not", " ", "to", " ", "be", ",", " ", "that", " ", "is", " ", "the", " ", "question", ".",
  126. </pre><p>This sentence "生きるか死ぬか、それが問題だ。" (<a href="http://tatoeba.org/eng/sentences/show/868189">from Tatoeba database</a>) would be split into following segments in <code>ja_JP.UTF-8</code> (Japanese) locale:</p>
  127. <pre class="fragment">"生", "きるか", "死", "ぬか", "、", "それが", "問題", "だ", "。",
  128. </pre><p>The boundary analysis that is done by Boost.Locale is much more complicated than just splitting the text according to white space characters, even though it is not perfect.</p>
  129. <h1><a class="anchor" id="boundary_analysys_segments_rules"></a>
  130. Using Rules</h1>
  131. <p>The segments selection can be customized using <a class="el" href="group__boundary.html#gad19735180401edb15acbdbbeb21e5a73">rule()</a> and <a class="el" href="group__boundary.html#ga205fd51daa439a18527675e663a0802f">full_select()</a> member functions.</p>
  132. <p>By default, segment_index's iterator returns each text segment defined by two boundary points regardless of the way they were selected. Thus in the example above we could see text segments like "." or " " that were selected as words.</p>
  133. <p>Using a <code>rule()</code> member function we can specify a binary mask of rules we want to use for selection of the boundary points using <a class="el" href="group__boundary.html#bl_boundary_word_rules">word</a>, <a class="el" href="group__boundary.html#bl_boundary_line_rules">line</a> and <a class="el" href="group__boundary.html#bl_boundary_sentence_rules">sentence</a> boundary rules.</p>
  134. <p>For example, by calling</p>
  135. <div class="fragment"><div class="line">map.rule(<a class="code" href="group__boundary.html#ga3ab98808dbb1cc4a346dcc2554c9d8dc">word_any</a>);</div>
  136. </div><!-- fragment --><p>before starting the iteration process, we specify a selection mask that fetches numbers, letters, Kana letters and ideographic characters, ignoring all non-word related characters like white space or punctuation marks.</p>
  137. <p>So the code:</p>
  138. <div class="fragment"><div class="line"><span class="keyword">using namespace </span>boost::locale::boundary;</div>
  139. <div class="line">std::string text=<span class="stringliteral">&quot;To be or not to be, that is the question.&quot;</span></div>
  140. <div class="line"><span class="comment">// Create mapping of text for token iterator using global locale.</span></div>
  141. <div class="line"><a class="code" href="classboost_1_1locale_1_1boundary_1_1segment__index.html">ssegment_index</a> map(<a class="code" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a99aad8b8a5e25baa9f695abe5e574bb6">word</a>,text.begin(),text.end()); </div>
  142. <div class="line"><span class="comment">// Define a rule</span></div>
  143. <div class="line">map.<a class="code" href="group__boundary.html#ga72b4ceb5bacec0eded2601c43a4d671a">rule</a>(<a class="code" href="group__boundary.html#ga3ab98808dbb1cc4a346dcc2554c9d8dc">word_any</a>);</div>
  144. <div class="line"><span class="comment">// Print all &quot;words&quot; -- chunks of word boundary</span></div>
  145. <div class="line"><span class="keywordflow">for</span>(<a class="code" href="group__boundary.html#gaf7a775e77dbbca3495e11d646df96fd2">ssegment_index::iterator</a> it=map.begin(),e=map.end();it!=e;++it)</div>
  146. <div class="line"> std::cout &lt;&lt;<span class="stringliteral">&quot;\&quot;&quot;</span>&lt;&lt; * it &lt;&lt; <span class="stringliteral">&quot;\&quot;, &quot;</span>;</div>
  147. <div class="line">std::cout &lt;&lt; std::endl;</div>
  148. </div><!-- fragment --><p>Would print:</p>
  149. <pre class="fragment">"To", "be", "or", "not", "to", "be", "that", "is", "the", "question",
  150. </pre><p>And for the given text="生きるか死ぬか、それが問題だ。" and rule(<a class="el" href="group__boundary.html#ga705ab96f9e62810c8ed977c90d404ef8">word_ideo</a>), the example above would print:</p>
  151. <pre class="fragment">"生", "死", "問題",
  152. </pre><p>You can find out which specific rules a segment was selected by using the <a class="el" href="classboost_1_1locale_1_1boundary_1_1segment.html#a5b36a522d7013306617dbcccc9919343">segment::rule()</a> member function, which returns a bit-mask of rules.</p>
  153. <p>For example:</p>
  154. <div class="fragment"><div class="line"><a class="code" href="classboost_1_1locale_1_1generator.html">boost::locale::generator</a> gen;</div>
  155. <div class="line"><span class="keyword">using namespace </span>boost::locale::boundary;</div>
  156. <div class="line">std::string text=<span class="stringliteral">&quot;生きるか死ぬか、それが問題だ。&quot;</span>;</div>
  157. <div class="line"><a class="code" href="classboost_1_1locale_1_1boundary_1_1segment__index.html">ssegment_index</a> map(<a class="code" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a99aad8b8a5e25baa9f695abe5e574bb6">word</a>,text.begin(),text.end(),gen(<span class="stringliteral">&quot;ja_JP.UTF-8&quot;</span>)); </div>
  158. <div class="line"><span class="keywordflow">for</span>(<a class="code" href="group__boundary.html#gaf7a775e77dbbca3495e11d646df96fd2">ssegment_index::iterator</a> it=map.begin(),e=map.end();it!=e;++it) {</div>
  159. <div class="line"> std::cout &lt;&lt; <span class="stringliteral">&quot;Segment &quot;</span> &lt;&lt; *it &lt;&lt; <span class="stringliteral">&quot; contains: &quot;</span>;</div>
  160. <div class="line"> <span class="keywordflow">if</span>(it-&gt;rule() &amp; <a class="code" href="group__boundary.html#ga90cf4f01b95658f659685377226677e7">word_none</a>)</div>
  161. <div class="line"> std::cout &lt;&lt; <span class="stringliteral">&quot;white space or punctuation marks &quot;</span>;</div>
  162. <div class="line"> <span class="keywordflow">if</span>(it-&gt;rule() &amp; <a class="code" href="group__boundary.html#ga52d8c63e1f3f8c898c645352206a78ef">word_kana</a>)</div>
  163. <div class="line"> std::cout &lt;&lt; <span class="stringliteral">&quot;kana characters &quot;</span>;</div>
  164. <div class="line"> <span class="keywordflow">if</span>(it-&gt;rule() &amp; <a class="code" href="group__boundary.html#ga705ab96f9e62810c8ed977c90d404ef8">word_ideo</a>)</div>
  165. <div class="line"> std::cout &lt;&lt; <span class="stringliteral">&quot;ideographic characters&quot;</span>;</div>
  166. <div class="line"> std::cout&lt;&lt; std::endl;</div>
  167. <div class="line">}</div>
  168. </div><!-- fragment --><p>Would print</p>
  169. <pre class="fragment">Segment 生 contains: ideographic characters
  170. Segment きるか contains: kana characters
  171. Segment 死 contains: ideographic characters
  172. Segment ぬか contains: kana characters
  173. Segment 、 contains: white space or punctuation marks
  174. Segment それが contains: kana characters
  175. Segment 問題 contains: ideographic characters
  176. Segment だ contains: kana characters
  177. Segment 。 contains: white space or punctuation marks
  178. </pre><p>One important thing that should be noted is that each segment is defined by a pair of boundaries, and the rule of its ending point defines whether it is selected or not.</p>
  179. <p>In some cases this may not be what we are actually looking for.</p>
  180. <p>For example we have a text:</p>
  181. <pre class="fragment">Hello! How
  182. are you?
  183. </pre><p>And we want to fetch all sentences from the text.</p>
  184. <p>The <a class="el" href="group__boundary.html#bl_boundary_sentence_rules">sentence rules</a> have two options:</p>
  185. <ul>
  186. <li>Split the text at the point where a sentence terminator like ".!?" is detected: <a class="el" href="group__boundary.html#ga3befefe67f79691c117bf5588741355b">sentence_term</a></li>
  187. <li>Split the text at the point where a sentence separator like "line feed" is detected: <a class="el" href="group__boundary.html#gaf67883341dd3d8f786e7281d40790000">sentence_sep</a></li>
  188. </ul>
  189. <p>Naturally to ignore sentence separators we would call <a class="el" href="group__boundary.html#gad19735180401edb15acbdbbeb21e5a73">segment_index::rule(rule_type v)</a> with sentence_term parameter and then run the iterator.</p>
  190. <div class="fragment"><div class="line"><a class="code" href="classboost_1_1locale_1_1generator.html">boost::locale::generator</a> gen;</div>
  191. <div class="line"><span class="keyword">using namespace </span>boost::locale::boundary;</div>
  192. <div class="line">std::string text= <span class="stringliteral">&quot;Hello! How\n&quot;</span></div>
  193. <div class="line"> <span class="stringliteral">&quot;are you?\n&quot;</span>;</div>
  194. <div class="line"><a class="code" href="classboost_1_1locale_1_1boundary_1_1segment__index.html">ssegment_index</a> map(<a class="code" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a88aa1509eace7589f5df87d4694871e9">sentence</a>,text.begin(),text.end(),gen(<span class="stringliteral">&quot;en_US.UTF-8&quot;</span>)); </div>
  195. <div class="line">map.<a class="code" href="group__boundary.html#ga72b4ceb5bacec0eded2601c43a4d671a">rule</a>(<a class="code" href="group__boundary.html#ga3befefe67f79691c117bf5588741355b">sentence_term</a>);</div>
  196. <div class="line"><span class="keywordflow">for</span>(<a class="code" href="group__boundary.html#gaf7a775e77dbbca3495e11d646df96fd2">ssegment_index::iterator</a> it=map.begin(),e=map.end();it!=e;++it) </div>
  197. <div class="line"> std::cout &lt;&lt; <span class="stringliteral">&quot;Sentence [&quot;</span> &lt;&lt; *it &lt;&lt; <span class="stringliteral">&quot;]&quot;</span> &lt;&lt; std::endl;</div>
  198. </div><!-- fragment --><p>However, we would not get the expected segments: </p>
  199. <pre class="fragment">Sentence [Hello! ]
  200. Sentence [are you?
  201. ]
  202. </pre><p>The reason is that "How\n" is still considered a sentence but is selected by a different rule.</p>
  203. <p>This behavior can be changed by setting <a class="el" href="group__boundary.html#ga205fd51daa439a18527675e663a0802f">segment_index::full_select(bool)</a> to <code>true</code>. It would force the iterator to join the current segment with all previous segments that may not fit the required rule.</p>
  204. <p>So we add this line:</p>
  205. <div class="fragment"><div class="line">map.full_select(<span class="keyword">true</span>);</div>
  206. </div><!-- fragment --><p>right after "map.rule(sentence_term);" and get the expected output:</p>
  207. <pre class="fragment">Sentence [Hello! ]
  208. Sentence [How
  209. are you?
  210. ]
  211. </pre><h2><a class="anchor" id="boundary_analysys_segments_search"></a>
  212. Locating Segments</h2>
  213. <p>Sometimes it is useful to find the segment that some specific iterator is pointing to.</p>
  214. <p>For example, if a user has clicked at a specific point, we may want to select the word at this location.</p>
  215. <p><a class="el" href="classboost_1_1locale_1_1boundary_1_1segment__index.html">segment_index</a> provides <a class="el" href="group__boundary.html#ga2480236106971797460187777f2a4411">find(base_iterator p)</a> member function for this purpose.</p>
  216. <p>This function returns an iterator to the segment that <em>p</em> points to.</p>
  217. <p>For example:</p>
  218. <div class="fragment"><div class="line">text=<span class="stringliteral">&quot;to be or &quot;</span>;</div>
  219. <div class="line"><a class="code" href="group__boundary.html#gad4785439a3f03ee455c93830b8f1366c">ssegment_index</a> map(<a class="code" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a99aad8b8a5e25baa9f695abe5e574bb6">word</a>,text.begin(),text.end(),gen(<span class="stringliteral">&quot;en_US.UTF-8&quot;</span>));</div>
  220. <div class="line">ssegment_index::iterator p = map.find(text.begin() + 4);</div>
  221. <div class="line"><span class="keywordflow">if</span>(p!=map.end())</div>
  222. <div class="line"> std::cout &lt;&lt; *p &lt;&lt; std::endl;</div>
  223. </div><!-- fragment --><p>Would print:</p>
  224. <pre class="fragment">be
  225. </pre><dl class="section note"><dt>Note</dt><dd></dd></dl>
  226. <p>If the iterator lies inside a segment, this segment is returned. If the segment does not fit the selection rules, then the segment following the requested position is returned.</p>
  227. <p>For example: For <a class="el" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a99aad8b8a5e25baa9f695abe5e574bb6">word</a> boundary analysis with <a class="el" href="group__boundary.html#ga3ab98808dbb1cc4a346dcc2554c9d8dc">word_any</a> rule:</p>
  228. <ul>
  229. <li>"t|o be or ", would point to "to" - the iterator in the middle of segment "to".</li>
  230. <li>"to |be or ", would point to "be" - the iterator at the beginning of the segment "be"</li>
  231. <li>"to| be or ", would point to "be" - the iterator does not point to a segment with the required rule, so the next valid segment, "be", is selected.</li>
  232. <li>"to be or| ", would point to end as no valid segment is found.</li>
  233. </ul>
  234. <h1><a class="anchor" id="boundary_analysys_break"></a>
  235. Iterating Over Boundary Points</h1>
  236. <h1><a class="anchor" id="boundary_analysys_break_basics"></a>
  237. Basic Iteration</h1>
  238. <p>The <a class="el" href="classboost_1_1locale_1_1boundary_1_1boundary__point__index.html">boundary_point_index</a> is similar to <a class="el" href="classboost_1_1locale_1_1boundary_1_1segment__index.html">segment_index</a> in its interface but has a different role. Instead of returning text chunks (<a class="el" href="classboost_1_1locale_1_1boundary_1_1segment.html">segment</a>s), it returns a <a class="el" href="classboost_1_1locale_1_1boundary_1_1boundary__point.html">boundary_point</a> object that represents a position in the text - a base iterator that is used for iteration over the source text's C++ characters. The <a class="el" href="classboost_1_1locale_1_1boundary_1_1boundary__point.html">boundary_point</a> object also provides a <a class="el" href="classboost_1_1locale_1_1boundary_1_1boundary__point.html#a757b28e11c66f9871d3f51fe93a51bdb">rule()</a> member function that reports the rule according to which this boundary point was selected.</p>
  239. <dl class="section note"><dt>Note</dt><dd>The beginning and the ending of the text are considered boundary points, so even an empty text consists of at least one boundary point.</dd></dl>
  240. <p>Let's see an example of selecting the first two sentences from a text:</p>
  241. <div class="fragment"><div class="line"><span class="keyword">using namespace </span>boost::locale::boundary;</div>
  242. <div class="line"><a class="code" href="classboost_1_1locale_1_1generator.html">boost::locale::generator</a> gen;</div>
  243. <div class="line"></div>
  244. <div class="line"><span class="comment">// our text sample</span></div>
  245. <div class="line">std::string <span class="keyword">const</span> text=<span class="stringliteral">&quot;First sentence. Second sentence! Third one?&quot;</span>;</div>
  246. <div class="line"><span class="comment">// Create an index </span></div>
  247. <div class="line"><a class="code" href="classboost_1_1locale_1_1boundary_1_1boundary__point__index.html">sboundary_point_index</a> map(<a class="code" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a88aa1509eace7589f5df87d4694871e9">sentence</a>,text.begin(),text.end(),gen(<span class="stringliteral">&quot;en_US.UTF-8&quot;</span>));</div>
  248. <div class="line"></div>
  249. <div class="line"><span class="comment">// Count two boundary points</span></div>
  250. <div class="line"><a class="code" href="group__boundary.html#ga1af6e72b3c384edcebc0cf319fe97efe">sboundary_point_index::iterator</a> p = map.<a class="code" href="group__boundary.html#ga56f42a32f0378b6e157671f9e17bd66f">begin</a>(),e=map.end();</div>
  251. <div class="line"><span class="keywordtype">int</span> count = 0;</div>
  252. <div class="line"><span class="keywordflow">while</span>(p!=e &amp;&amp; count &lt; 2) {</div>
  253. <div class="line"> ++count;</div>
  254. <div class="line"> ++p;</div>
  255. <div class="line">}</div>
  256. <div class="line"></div>
  257. <div class="line"><span class="keywordflow">if</span>(p!=e) {</div>
  258. <div class="line"> std::cout &lt;&lt; <span class="stringliteral">&quot;First two sentences are: &quot;</span> </div>
  259. <div class="line"> &lt;&lt; std::string(text.begin(),p-&gt;iterator()) </div>
  260. <div class="line"> &lt;&lt; std::endl;</div>
  261. <div class="line">}</div>
  262. <div class="line"><span class="keywordflow">else</span> {</div>
  263. <div class="line"> std::cout &lt;&lt;<span class="stringliteral">&quot;There are less then two sentences in this &quot;</span></div>
  264. <div class="line"> &lt;&lt;<span class="stringliteral">&quot;text: &quot;</span> &lt;&lt; text &lt;&lt; std::endl;</div>
  265. <div class="line">}</div>
  266. </div><!-- fragment --><p>Would print:</p>
  267. <pre class="fragment">First two sentences are: First sentence. Second sentence!
  268. </pre><h1><a class="anchor" id="boundary_analysys_break_rules"></a>
  269. Using Rules</h1>
  270. <p>Similarly to the <a class="el" href="classboost_1_1locale_1_1boundary_1_1segment__index.html">segment_index</a> the <a class="el" href="classboost_1_1locale_1_1boundary_1_1boundary__point__index.html">boundary_point_index</a> provides a <a class="el" href="group__boundary.html#ga56e63913f51109e05a24a7136472a975">rule(rule_type mask)</a> member function to filter boundary points that interest us.</p>
  271. <p>It allows setting <a class="el" href="group__boundary.html#bl_boundary_word_rules">word</a>, <a class="el" href="group__boundary.html#bl_boundary_line_rules">line</a> and <a class="el" href="group__boundary.html#bl_boundary_sentence_rules">sentence</a> rules for filtering boundary points.</p>
  272. <p>Let's change the example above a little:</p>
  273. <div class="fragment"><div class="line"><span class="comment">// our text sample</span></div>
  274. <div class="line">std::string <span class="keyword">const</span> text= <span class="stringliteral">&quot;First sentence. Second\n&quot;</span></div>
  275. <div class="line"> <span class="stringliteral">&quot;sentence! Third one?&quot;</span>;</div>
  276. </div><!-- fragment --><p>If we run our program as is on the sample above we would get: </p>
  277. <pre class="fragment">First two sentences are: First sentence. Second
  278. </pre><p>This is not something that we really expected, as "Second\n" is considered an independent sentence that was separated by the line separator "Line Feed".</p>
  279. <p>However, we can set the rule <a class="el" href="group__boundary.html#ga3befefe67f79691c117bf5588741355b">sentence_term</a> and the iterator would use only boundary points that are created by sentence terminators like ".!?".</p>
  280. <p>So by adding: </p>
  281. <div class="fragment"><div class="line">map.rule(<a class="code" href="group__boundary.html#ga3befefe67f79691c117bf5588741355b">sentence_term</a>);</div>
  282. </div><!-- fragment --><p>Right after the generation of the index we would get the desired output:</p>
  283. <pre class="fragment">First two sentences are: First sentence. Second
  284. sentence!
  285. </pre><p>You can also use <a class="el" href="classboost_1_1locale_1_1boundary_1_1boundary__point.html#a757b28e11c66f9871d3f51fe93a51bdb">boundary_point::rule()</a> member function to learn about the reason this boundary point was created by comparing it with an appropriate mask.</p>
  286. <p>For example:</p>
  287. <div class="fragment"><div class="line"><span class="keyword">using namespace </span>boost::locale::boundary;</div>
  288. <div class="line"><a class="code" href="classboost_1_1locale_1_1generator.html">boost::locale::generator</a> gen;</div>
  289. <div class="line"><span class="comment">// our text sample</span></div>
  290. <div class="line">std::string <span class="keyword">const</span> text= <span class="stringliteral">&quot;First sentence. Second\n&quot;</span></div>
  291. <div class="line"> <span class="stringliteral">&quot;sentence! Third one?&quot;</span>;</div>
  292. <div class="line"><a class="code" href="classboost_1_1locale_1_1boundary_1_1boundary__point__index.html">sboundary_point_index</a> map(<a class="code" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a88aa1509eace7589f5df87d4694871e9">sentence</a>,text.begin(),text.end(),gen(<span class="stringliteral">&quot;en_US.UTF-8&quot;</span>));</div>
  293. <div class="line"></div>
  294. <div class="line"><span class="keywordflow">for</span>(<a class="code" href="group__boundary.html#ga1af6e72b3c384edcebc0cf319fe97efe">sboundary_point_index::iterator</a> p = map.begin(),e=map.end();p!=e;++p) {</div>
  295. <div class="line"> <span class="keywordflow">if</span>(p-&gt;rule() &amp; <a class="code" href="group__boundary.html#ga3befefe67f79691c117bf5588741355b">sentence_term</a>)</div>
  296. <div class="line"> std::cout &lt;&lt; <span class="stringliteral">&quot;There is a sentence terminator: &quot;</span>;</div>
  297. <div class="line"> <span class="keywordflow">else</span> <span class="keywordflow">if</span>(p-&gt;rule() &amp; <a class="code" href="group__boundary.html#gaf67883341dd3d8f786e7281d40790000">sentence_sep</a>)</div>
  298. <div class="line"> std::cout &lt;&lt; <span class="stringliteral">&quot;There is a sentence separator: &quot;</span>;</div>
  299. <div class="line"> <span class="keywordflow">if</span>(p-&gt;rule()!=0) <span class="comment">// print if some rule exists</span></div>
  300. <div class="line"> std::cout &lt;&lt; <span class="stringliteral">&quot;[&quot;</span> &lt;&lt; std::string(text.begin(),p-&gt;iterator()) </div>
  301. <div class="line"> &lt;&lt; <span class="stringliteral">&quot;|&quot;</span> &lt;&lt; std::string(p-&gt;iterator(),text.end()) </div>
  302. <div class="line"> &lt;&lt; <span class="stringliteral">&quot;]\n&quot;</span>;</div>
  303. <div class="line">}</div>
  304. </div><!-- fragment --><p>Would give the following output: </p>
  305. <pre class="fragment">There is a sentence terminator: [First sentence. |Second
  306. sentence! Third one?]
  307. There is a sentence separator: [First sentence. Second
  308. |sentence! Third one?]
  309. There is a sentence terminator: [First sentence. Second
  310. sentence! |Third one?]
  311. There is a sentence terminator: [First sentence. Second
  312. sentence! Third one?|]
  313. </pre><h2><a class="anchor" id="boundary_analysys_break_search"></a>
  314. Locating Boundary Points</h2>
  315. <p>Sometimes it is useful to find a specific boundary point according to a given iterator.</p>
  316. <p><a class="el" href="classboost_1_1locale_1_1boundary_1_1boundary__point__index.html">boundary_point_index</a> provides the <a class="el" href="group__boundary.html#ga0bb71a287afca990e85b17246568492d">iterator find(base_iterator p)</a> member function.</p>
  317. <p>It would return an iterator to a boundary point on <em>p's</em> location or at the location following it if <em>p</em> does not point to an appropriate position.</p>
  318. <p>For example, for word boundary analysis:</p>
  319. <ul>
  320. <li>If a base iterator points to "to |be", then the returned boundary point would be "to |be" (same position)</li>
  321. <li>If a base iterator points to "t|o be", then the returned boundary point would be "to| be" (next valid position)</li>
  322. </ul>
  323. <p>For example, if we want to select 6 words around a specific boundary point, we can use the following code:</p>
  324. <div class="fragment"><div class="line"><span class="keyword">using namespace </span>boost::locale::boundary;</div>
  325. <div class="line"><a class="code" href="classboost_1_1locale_1_1generator.html">boost::locale::generator</a> gen;</div>
  326. <div class="line"><span class="comment">// our text sample</span></div>
  327. <div class="line">std::string <span class="keyword">const</span> text= <span class="stringliteral">&quot;To be or not to be, that is the question.&quot;</span>;</div>
  328. <div class="line"></div>
  329. <div class="line"><span class="comment">// Create a mapping</span></div>
  330. <div class="line"><a class="code" href="classboost_1_1locale_1_1boundary_1_1boundary__point__index.html">sboundary_point_index</a> map(<a class="code" href="group__boundary.html#gga15de9963ce9bb6037c8525901dfbf641a99aad8b8a5e25baa9f695abe5e574bb6">word</a>,text.begin(),text.end(),gen(<span class="stringliteral">&quot;en_US.UTF-8&quot;</span>));</div>
  331. <div class="line"><span class="comment">// Ignore white space</span></div>
  332. <div class="line">map.<a class="code" href="group__boundary.html#ga1d214029f1a780b7bf6e3f23a3004c03">rule</a>(<a class="code" href="group__boundary.html#ga3ab98808dbb1cc4a346dcc2554c9d8dc">word_any</a>);</div>
  333. <div class="line"></div>
  334. <div class="line"><span class="comment">// define our arbitrary point</span></div>
  335. <div class="line">std::string::const_iterator pos = text.begin() + 12; <span class="comment">// &quot;not| to&quot;;</span></div>
  336. <div class="line"></div>
  337. <div class="line"><span class="comment">// Get the search range</span></div>
  338. <div class="line"><a class="code" href="group__boundary.html#ga1af6e72b3c384edcebc0cf319fe97efe">sboundary_point_index::iterator</a> </div>
  339. <div class="line"> begin =map.begin(),</div>
  340. <div class="line"> end = map.end(),</div>
  341. <div class="line"> it = map.find(pos); <span class="comment">// find a boundary</span></div>
  342. <div class="line"></div>
  343. <div class="line"><span class="comment">// go 3 words backward</span></div>
  344. <div class="line"><span class="keywordflow">for</span>(<span class="keywordtype">int</span> count = 0;count &lt;3 &amp;&amp; it!=begin; count ++) </div>
  345. <div class="line"> --it;</div>
  346. <div class="line"></div>
  347. <div class="line"><span class="comment">// Save the start</span></div>
  348. <div class="line">std::string::const_iterator start = *it;</div>
  349. <div class="line"></div>
  350. <div class="line"><span class="comment">// go 6 words forward</span></div>
  351. <div class="line"><span class="keywordflow">for</span>(<span class="keywordtype">int</span> count = 0;count &lt; 6 &amp;&amp; it!=end; count ++)</div>
  352. <div class="line"> ++it;</div>
  353. <div class="line"></div>
  354. <div class="line"><span class="comment">// make sure we are at a valid position</span></div>
  355. <div class="line"><span class="keywordflow">if</span>(it==end)</div>
  356. <div class="line"> --it;</div>
  357. <div class="line"></div>
  358. <div class="line"><span class="comment">// print the text</span></div>
  359. <div class="line">std::cout &lt;&lt; std::string(start,it-&gt;iterator()) &lt;&lt; std::endl;</div>
  360. </div><!-- fragment --><p>That would print:</p>
  361. <pre class="fragment"> be or not to be, that
  362. </pre> </div></div><!-- contents -->
  363. </div><!-- doc-content -->
  364. <li class="footer">
  365. &copy; Copyright 2009-2012 Artyom Beilis, Distributed under the <a href="http://www.boost.org/LICENSE_1_0.txt">Boost Software License</a>, Version 1.0.
  366. </li>
  367. </ul>
  368. </div>
  369. </body>
  370. </html>