Boost.Sort
string_sort.hpp
Go to the documentation of this file.
1 // Details for a templated general-case hybrid-radix string_sort.
2 
3 // Copyright Steven J. Ross 2001 - 2014.
4 // Distributed under the Boost Software License, Version 1.0.
5 // (See accompanying file LICENSE_1_0.txt or copy at
6 // http://www.boost.org/LICENSE_1_0.txt)
7 
8 // See http://www.boost.org/libs/sort for library home page.
9 
10 /*
11 Some improvements suggested by:
12 Phil Endecott and Frank Gennari
13 */
14 
15 #ifndef BOOST_SORT_SPREADSORT_DETAIL_SPREAD_SORT_HPP
16 #define BOOST_SORT_SPREADSORT_DETAIL_SPREAD_SORT_HPP
17 #include <algorithm>
18 #include <vector>
19 #include <cstring>
20 #include <limits>
21 #include <functional>
22 #include <boost/static_assert.hpp>
23 #include <boost/serialization/static_warning.hpp>
24 #include <boost/utility/enable_if.hpp>
27 #include <boost/cstdint.hpp>
28 
29 namespace boost {
30 namespace sort {
31  namespace detail {
32  static const int max_step_size = 64;
33 
34  //Offsetting on identical characters. This function works a chunk of
35  //characters at a time for cache efficiency and optimal worst-case
36  //performance.
37  template<class RandomAccessIter, class Unsigned_char_type>
38  inline void
39  update_offset(RandomAccessIter first, RandomAccessIter finish,
40  size_t &char_offset)
41  {
42  const int char_size = sizeof(Unsigned_char_type);
43  size_t nextOffset = char_offset;
44  int step_size = max_step_size;
45  while (true) {
46  RandomAccessIter curr = first;
47  do {
48  //Ignore empties, but if the nextOffset would exceed the length or
49  //not match, exit; we've found the last matching character
50  //This will reduce the step_size if the current step doesn't match.
51  if ((*curr).size() > char_offset) {
52  if((*curr).size() <= (nextOffset + step_size)) {
53  step_size = (*curr).size() - nextOffset - 1;
54  if (step_size < 1) {
55  char_offset = nextOffset;
56  return;
57  }
58  }
59  const int step_byte_size = step_size * char_size;
60  if (memcmp(curr->data() + nextOffset, first->data() + nextOffset,
61  step_byte_size) != 0) {
62  if (step_size == 1) {
63  char_offset = nextOffset;
64  return;
65  }
66  step_size = (step_size > 4) ? 4 : 1;
67  continue;
68  }
69  }
70  ++curr;
71  } while (curr != finish);
72  nextOffset += step_size;
73  }
74  }
75 
76  //Offsetting on identical characters. This function works a character
77  //at a time for optimal worst-case performance.
78  template<class RandomAccessIter, class Get_char, class Get_length>
79  inline void
80  update_offset(RandomAccessIter first, RandomAccessIter finish,
81  size_t &char_offset, Get_char get_character, Get_length length)
82  {
83  size_t nextOffset = char_offset;
84  while (true) {
85  RandomAccessIter curr = first;
86  do {
87  //ignore empties, but if the nextOffset would exceed the length or
88  //not match, exit; we've found the last matching character
89  if (length(*curr) > char_offset && (length(*curr) <= (nextOffset + 1)
90  || get_character((*curr), nextOffset) != get_character((*first), nextOffset))) {
91  char_offset = nextOffset;
92  return;
93  }
94  } while (++curr != finish);
95  ++nextOffset;
96  }
97  }
98 
99  //This comparison functor assumes strings are identical up to char_offset
100  template<class Data_type, class Unsigned_char_type>
102  offset_less_than(size_t char_offset) : fchar_offset(char_offset){}
103  inline bool operator()(const Data_type &x, const Data_type &y) const
104  {
105  size_t minSize = (std::min)(x.size(), y.size());
106  for (size_t u = fchar_offset; u < minSize; ++u) {
107  BOOST_STATIC_ASSERT(sizeof(x[u]) == sizeof(Unsigned_char_type));
108  if (static_cast<Unsigned_char_type>(x[u]) !=
109  static_cast<Unsigned_char_type>(y[u])) {
110  return static_cast<Unsigned_char_type>(x[u]) <
111  static_cast<Unsigned_char_type>(y[u]);
112  }
113  }
114  return x.size() < y.size();
115  }
116  size_t fchar_offset;
117  };
118 
119  //Compares strings assuming they are identical up to char_offset
120  template<class Data_type, class Unsigned_char_type>
122  offset_greater_than(size_t char_offset) : fchar_offset(char_offset){}
123  inline bool operator()(const Data_type &x, const Data_type &y) const
124  {
125  size_t minSize = (std::min)(x.size(), y.size());
126  for (size_t u = fchar_offset; u < minSize; ++u) {
127  BOOST_STATIC_ASSERT(sizeof(x[u]) == sizeof(Unsigned_char_type));
128  if (static_cast<Unsigned_char_type>(x[u]) !=
129  static_cast<Unsigned_char_type>(y[u])) {
130  return static_cast<Unsigned_char_type>(x[u]) >
131  static_cast<Unsigned_char_type>(y[u]);
132  }
133  }
134  return x.size() > y.size();
135  }
136  size_t fchar_offset;
137  };
138 
139  //This comparison functor assumes strings are identical up to char_offset
140  template<class Data_type, class Get_char, class Get_length>
142  offset_char_less_than(size_t char_offset) : fchar_offset(char_offset){}
143  inline bool operator()(const Data_type &x, const Data_type &y) const
144  {
145  size_t minSize = (std::min)(length(x), length(y));
146  for (size_t u = fchar_offset; u < minSize; ++u) {
147  if (get_character(x, u) != get_character(y, u)) {
148  return get_character(x, u) < get_character(y, u);
149  }
150  }
151  return length(x) < length(y);
152  }
153  size_t fchar_offset;
154  Get_char get_character;
155  Get_length length;
156  };
157 
158  //String sorting recursive implementation
159  template <class RandomAccessIter, class Unsigned_char_type>
160  inline void
161  string_sort_rec(RandomAccessIter first, RandomAccessIter last,
162  size_t char_offset,
163  std::vector<RandomAccessIter> &bin_cache,
164  unsigned cache_offset, size_t *bin_sizes)
165  {
166  typedef typename std::iterator_traits<RandomAccessIter>::value_type
167  Data_type;
168  //This section makes handling of long identical substrings much faster
169  //with a mild average performance impact.
170  //Iterate to the end of the empties. If all empty, return
171  while ((*first).size() <= char_offset) {
172  if (++first == last)
173  return;
174  }
175  RandomAccessIter finish = last - 1;
176  //Getting the last non-empty
177  for (;(*finish).size() <= char_offset; --finish);
178  ++finish;
179  //Offsetting on identical characters. This section works
180  //a few characters at a time for optimal worst-case performance.
181  update_offset<RandomAccessIter, Unsigned_char_type>(first, finish,
182  char_offset);
183 
184  const unsigned bin_count = (1 << (sizeof(Unsigned_char_type)*8));
185  //Equal worst-case of radix and comparison is when bin_count = n*log(n).
186  const unsigned max_size = bin_count;
187  const unsigned membin_count = bin_count + 1;
188  unsigned cache_end;
189  RandomAccessIter * bins = size_bins(bin_sizes, bin_cache, cache_offset,
190  cache_end, membin_count) + 1;
191 
192  //Calculating the size of each bin; this takes roughly 10% of runtime
193  for (RandomAccessIter current = first; current != last; ++current) {
194  if ((*current).size() <= char_offset) {
195  bin_sizes[0]++;
196  }
197  else
198  bin_sizes[static_cast<Unsigned_char_type>((*current)[char_offset])
199  + 1]++;
200  }
201  //Assign the bin positions
202  bin_cache[cache_offset] = first;
203  for (unsigned u = 0; u < membin_count - 1; u++)
204  bin_cache[cache_offset + u + 1] =
205  bin_cache[cache_offset + u] + bin_sizes[u];
206 
207  //Swap into place
208  RandomAccessIter next_bin_start = first;
209  //handling empty bins
210  RandomAccessIter * local_bin = &(bin_cache[cache_offset]);
211  next_bin_start += bin_sizes[0];
212  RandomAccessIter * target_bin;
213  //Iterating over each element in the bin of empties
214  for (RandomAccessIter current = *local_bin; current < next_bin_start;
215  ++current) {
216  //empties belong in this bin
217  while ((*current).size() > char_offset) {
218  target_bin =
219  bins + static_cast<Unsigned_char_type>((*current)[char_offset]);
220  iter_swap(current, (*target_bin)++);
221  }
222  }
223  *local_bin = next_bin_start;
224  //iterate backwards to find the last bin with elements in it
225  //this saves iterations in multiple loops
226  unsigned last_bin = bin_count - 1;
227  for (; last_bin && !bin_sizes[last_bin + 1]; --last_bin);
228  //This dominates runtime, mostly in the swap and bin lookups
229  for (unsigned u = 0; u < last_bin; ++u) {
230  local_bin = bins + u;
231  next_bin_start += bin_sizes[u + 1];
232  //Iterating over each element in this bin
233  for (RandomAccessIter current = *local_bin; current < next_bin_start;
234  ++current) {
235  //Swapping into place until the correct element has been swapped in
236  for (target_bin = bins + static_cast<Unsigned_char_type>
237  ((*current)[char_offset]); target_bin != local_bin;
238  target_bin = bins + static_cast<Unsigned_char_type>
239  ((*current)[char_offset])) iter_swap(current, (*target_bin)++);
240  }
241  *local_bin = next_bin_start;
242  }
243  bins[last_bin] = last;
244  //Recursing
245  RandomAccessIter lastPos = bin_cache[cache_offset];
246  //Skip this loop for empties
247  for (unsigned u = cache_offset + 1; u < cache_offset + last_bin + 2;
248  lastPos = bin_cache[u], ++u) {
249  size_t count = bin_cache[u] - lastPos;
250  //don't sort unless there are at least two items to Compare
251  if (count < 2)
252  continue;
253  //using std::sort if its worst-case is better
254  if (count < max_size)
255  std::sort(lastPos, bin_cache[u],
257  else
258  string_sort_rec<RandomAccessIter, Unsigned_char_type>(lastPos,
259  bin_cache[u], char_offset + 1, bin_cache, cache_end, bin_sizes);
260  }
261  }
262 
263  //Sorts strings in reverse order, with empties at the end
264  template <class RandomAccessIter, class Unsigned_char_type>
265  inline void
266  reverse_string_sort_rec(RandomAccessIter first, RandomAccessIter last,
267  size_t char_offset,
268  std::vector<RandomAccessIter> &bin_cache,
269  unsigned cache_offset,
270  size_t *bin_sizes)
271  {
272  typedef typename std::iterator_traits<RandomAccessIter>::value_type
273  Data_type;
274  //This section makes handling of long identical substrings much faster
275  //with a mild average performance impact.
276  RandomAccessIter curr = first;
277  //Iterate to the end of the empties. If all empty, return
278  while ((*curr).size() <= char_offset) {
279  if (++curr == last)
280  return;
281  }
282  //Getting the last non-empty
283  while ((*(--last)).size() <= char_offset);
284  ++last;
285  //Offsetting on identical characters. This section works
286  //a few characters at a time for optimal worst-case performance.
287  update_offset<RandomAccessIter, Unsigned_char_type>(first, last,
288  char_offset);
289  RandomAccessIter * target_bin;
290 
291  const unsigned bin_count = (1 << (sizeof(Unsigned_char_type)*8));
292  //Equal worst-case of radix and comparison when bin_count = n*log(n).
293  const unsigned max_size = bin_count;
294  const unsigned membin_count = bin_count + 1;
295  const unsigned max_bin = bin_count - 1;
296  unsigned cache_end;
297  RandomAccessIter * bins = size_bins(bin_sizes, bin_cache, cache_offset,
298  cache_end, membin_count);
299  RandomAccessIter * end_bin = &(bin_cache[cache_offset + max_bin]);
300 
301  //Calculating the size of each bin; this takes roughly 10% of runtime
302  for (RandomAccessIter current = first; current != last; ++current) {
303  if ((*current).size() <= char_offset) {
304  bin_sizes[bin_count]++;
305  }
306  else
307  bin_sizes[max_bin - static_cast<Unsigned_char_type>
308  ((*current)[char_offset])]++;
309  }
310  //Assign the bin positions
311  bin_cache[cache_offset] = first;
312  for (unsigned u = 0; u < membin_count - 1; u++)
313  bin_cache[cache_offset + u + 1] =
314  bin_cache[cache_offset + u] + bin_sizes[u];
315 
316  //Swap into place
317  RandomAccessIter next_bin_start = last;
318  //handling empty bins
319  RandomAccessIter * local_bin = &(bin_cache[cache_offset + bin_count]);
320  RandomAccessIter lastFull = *local_bin;
321  //Iterating over each element in the bin of empties
322  for (RandomAccessIter current = *local_bin; current < next_bin_start;
323  ++current) {
324  //empties belong in this bin
325  while ((*current).size() > char_offset) {
326  target_bin =
327  end_bin - static_cast<Unsigned_char_type>((*current)[char_offset]);
328  iter_swap(current, (*target_bin)++);
329  }
330  }
331  *local_bin = next_bin_start;
332  next_bin_start = first;
333  //iterate backwards to find the last non-empty bin
334  //this saves iterations in multiple loops
335  unsigned last_bin = max_bin;
336  for (; last_bin && !bin_sizes[last_bin]; --last_bin);
337  //This dominates runtime, mostly in the swap and bin lookups
338  for (unsigned u = 0; u < last_bin; ++u) {
339  local_bin = bins + u;
340  next_bin_start += bin_sizes[u];
341  //Iterating over each element in this bin
342  for (RandomAccessIter current = *local_bin; current < next_bin_start;
343  ++current) {
344  //Swapping into place until the correct element has been swapped in
345  for (target_bin =
346  end_bin - static_cast<Unsigned_char_type>((*current)[char_offset]);
347  target_bin != local_bin;
348  target_bin =
349  end_bin - static_cast<Unsigned_char_type>((*current)[char_offset]))
350  iter_swap(current, (*target_bin)++);
351  }
352  *local_bin = next_bin_start;
353  }
354  bins[last_bin] = lastFull;
355  //Recursing
356  RandomAccessIter lastPos = first;
357  //Skip this loop for empties
358  for (unsigned u = cache_offset; u <= cache_offset + last_bin;
359  lastPos = bin_cache[u], ++u) {
360  size_t count = bin_cache[u] - lastPos;
361  //don't sort unless there are at least two items to Compare
362  if (count < 2)
363  continue;
364  //using std::sort if its worst-case is better
365  if (count < max_size)
366  std::sort(lastPos, bin_cache[u], offset_greater_than<Data_type,
367  Unsigned_char_type>(char_offset + 1));
368  else
369  reverse_string_sort_rec<RandomAccessIter, Unsigned_char_type>
370  (lastPos, bin_cache[u], char_offset + 1, bin_cache, cache_end, bin_sizes);
371  }
372  }
373 
374  //String sorting recursive implementation
375  template <class RandomAccessIter, class Unsigned_char_type, class Get_char,
376  class Get_length>
377  inline void
378  string_sort_rec(RandomAccessIter first, RandomAccessIter last,
379  size_t char_offset, std::vector<RandomAccessIter> &bin_cache,
380  unsigned cache_offset, size_t *bin_sizes,
381  Get_char get_character, Get_length length)
382  {
383  typedef typename std::iterator_traits<RandomAccessIter>::value_type
384  Data_type;
385  //This section makes handling of long identical substrings much faster
386  //with a mild average performance impact.
387  //Iterate to the end of the empties. If all empty, return
388  while (length(*first) <= char_offset) {
389  if (++first == last)
390  return;
391  }
392  RandomAccessIter finish = last - 1;
393  //Getting the last non-empty
394  for (;length(*finish) <= char_offset; --finish);
395  ++finish;
396  update_offset(first, finish, char_offset, get_character, length);
397 
398  const unsigned bin_count = (1 << (sizeof(Unsigned_char_type)*8));
399  //Equal worst-case of radix and comparison is when bin_count = n*log(n).
400  const unsigned max_size = bin_count;
401  const unsigned membin_count = bin_count + 1;
402  unsigned cache_end;
403  RandomAccessIter * bins = size_bins(bin_sizes, bin_cache, cache_offset,
404  cache_end, membin_count) + 1;
405 
406  //Calculating the size of each bin; this takes roughly 10% of runtime
407  for (RandomAccessIter current = first; current != last; ++current) {
408  if (length(*current) <= char_offset) {
409  bin_sizes[0]++;
410  }
411  else
412  bin_sizes[get_character((*current), char_offset) + 1]++;
413  }
414  //Assign the bin positions
415  bin_cache[cache_offset] = first;
416  for (unsigned u = 0; u < membin_count - 1; u++)
417  bin_cache[cache_offset + u + 1] =
418  bin_cache[cache_offset + u] + bin_sizes[u];
419 
420  //Swap into place
421  RandomAccessIter next_bin_start = first;
422  //handling empty bins
423  RandomAccessIter * local_bin = &(bin_cache[cache_offset]);
424  next_bin_start += bin_sizes[0];
425  RandomAccessIter * target_bin;
426  //Iterating over each element in the bin of empties
427  for (RandomAccessIter current = *local_bin; current < next_bin_start;
428  ++current) {
429  //empties belong in this bin
430  while (length(*current) > char_offset) {
431  target_bin = bins + get_character((*current), char_offset);
432  iter_swap(current, (*target_bin)++);
433  }
434  }
435  *local_bin = next_bin_start;
436  //iterate backwards to find the last bin with elements in it
437  //this saves iterations in multiple loops
438  unsigned last_bin = bin_count - 1;
439  for (; last_bin && !bin_sizes[last_bin + 1]; --last_bin);
440  //This dominates runtime, mostly in the swap and bin lookups
441  for (unsigned ii = 0; ii < last_bin; ++ii) {
442  local_bin = bins + ii;
443  next_bin_start += bin_sizes[ii + 1];
444  //Iterating over each element in this bin
445  for (RandomAccessIter current = *local_bin; current < next_bin_start;
446  ++current) {
447  //Swapping into place until the correct element has been swapped in
448  for (target_bin = bins + get_character((*current), char_offset);
449  target_bin != local_bin;
450  target_bin = bins + get_character((*current), char_offset))
451  iter_swap(current, (*target_bin)++);
452  }
453  *local_bin = next_bin_start;
454  }
455  bins[last_bin] = last;
456 
457  //Recursing
458  RandomAccessIter lastPos = bin_cache[cache_offset];
459  //Skip this loop for empties
460  for (unsigned u = cache_offset + 1; u < cache_offset + last_bin + 2;
461  lastPos = bin_cache[u], ++u) {
462  size_t count = bin_cache[u] - lastPos;
463  //don't sort unless there are at least two items to Compare
464  if (count < 2)
465  continue;
466  //using std::sort if its worst-case is better
467  if (count < max_size)
468  std::sort(lastPos, bin_cache[u], offset_char_less_than<Data_type,
469  Get_char, Get_length>(char_offset + 1));
470  else
471  string_sort_rec<RandomAccessIter, Unsigned_char_type, Get_char,
472  Get_length>(lastPos, bin_cache[u], char_offset + 1, bin_cache,
473  cache_end, bin_sizes, get_character, length);
474  }
475  }
476 
477  //String sorting recursive implementation
478  template <class RandomAccessIter, class Unsigned_char_type, class Get_char,
479  class Get_length, class Compare>
480  inline void
481  string_sort_rec(RandomAccessIter first, RandomAccessIter last,
482  size_t char_offset, std::vector<RandomAccessIter> &bin_cache,
483  unsigned cache_offset, size_t *bin_sizes,
484  Get_char get_character, Get_length length, Compare comp)
485  {
486  //This section makes handling of long identical substrings much faster
487  //with a mild average performance impact.
488  //Iterate to the end of the empties. If all empty, return
489  while (length(*first) <= char_offset) {
490  if (++first == last)
491  return;
492  }
493  RandomAccessIter finish = last - 1;
494  //Getting the last non-empty
495  for (;length(*finish) <= char_offset; --finish);
496  ++finish;
497  update_offset(first, finish, char_offset, get_character, length);
498 
499  const unsigned bin_count = (1 << (sizeof(Unsigned_char_type)*8));
500  //Equal worst-case of radix and comparison is when bin_count = n*log(n).
501  const unsigned max_size = bin_count;
502  const unsigned membin_count = bin_count + 1;
503  unsigned cache_end;
504  RandomAccessIter * bins = size_bins(bin_sizes, bin_cache, cache_offset,
505  cache_end, membin_count) + 1;
506 
507  //Calculating the size of each bin; this takes roughly 10% of runtime
508  for (RandomAccessIter current = first; current != last; ++current) {
509  if (length(*current) <= char_offset) {
510  bin_sizes[0]++;
511  }
512  else
513  bin_sizes[get_character((*current), char_offset) + 1]++;
514  }
515  //Assign the bin positions
516  bin_cache[cache_offset] = first;
517  for (unsigned u = 0; u < membin_count - 1; u++)
518  bin_cache[cache_offset + u + 1] =
519  bin_cache[cache_offset + u] + bin_sizes[u];
520 
521  //Swap into place
522  RandomAccessIter next_bin_start = first;
523  //handling empty bins
524  RandomAccessIter * local_bin = &(bin_cache[cache_offset]);
525  next_bin_start += bin_sizes[0];
526  RandomAccessIter * target_bin;
527  //Iterating over each element in the bin of empties
528  for (RandomAccessIter current = *local_bin; current < next_bin_start;
529  ++current) {
530  //empties belong in this bin
531  while (length(*current) > char_offset) {
532  target_bin = bins + get_character((*current), char_offset);
533  iter_swap(current, (*target_bin)++);
534  }
535  }
536  *local_bin = next_bin_start;
537  //iterate backwards to find the last bin with elements in it
538  //this saves iterations in multiple loops
539  unsigned last_bin = bin_count - 1;
540  for (; last_bin && !bin_sizes[last_bin + 1]; --last_bin);
541  //This dominates runtime, mostly in the swap and bin lookups
542  for (unsigned u = 0; u < last_bin; ++u) {
543  local_bin = bins + u;
544  next_bin_start += bin_sizes[u + 1];
545  //Iterating over each element in this bin
546  for (RandomAccessIter current = *local_bin; current < next_bin_start;
547  ++current) {
548  //Swapping into place until the correct element has been swapped in
549  for (target_bin = bins + get_character((*current), char_offset);
550  target_bin != local_bin;
551  target_bin = bins + get_character((*current), char_offset))
552  iter_swap(current, (*target_bin)++);
553  }
554  *local_bin = next_bin_start;
555  }
556  bins[last_bin] = last;
557 
558  //Recursing
559  RandomAccessIter lastPos = bin_cache[cache_offset];
560  //Skip this loop for empties
561  for (unsigned u = cache_offset + 1; u < cache_offset + last_bin + 2;
562  lastPos = bin_cache[u], ++u) {
563  size_t count = bin_cache[u] - lastPos;
564  //don't sort unless there are at least two items to Compare
565  if (count < 2)
566  continue;
567  //using std::sort if its worst-case is better
568  if (count < max_size)
569  std::sort(lastPos, bin_cache[u], comp);
570  else
571  string_sort_rec<RandomAccessIter, Unsigned_char_type, Get_char,
572  Get_length, Compare>
573  (lastPos, bin_cache[u], char_offset + 1, bin_cache, cache_end,
574  bin_sizes, get_character, length, comp);
575  }
576  }
577 
578  //Sorts strings in reverse order, with empties at the end
579  template <class RandomAccessIter, class Unsigned_char_type, class Get_char,
580  class Get_length, class Compare>
581  inline void
582  reverse_string_sort_rec(RandomAccessIter first, RandomAccessIter last,
583  size_t char_offset, std::vector<RandomAccessIter> &bin_cache,
584  unsigned cache_offset, size_t *bin_sizes,
585  Get_char get_character, Get_length length, Compare comp)
586  {
587  //This section makes handling of long identical substrings much faster
588  //with a mild average performance impact.
589  RandomAccessIter curr = first;
590  //Iterate to the end of the empties. If all empty, return
591  while (length(*curr) <= char_offset) {
592  if (++curr == last)
593  return;
594  }
595  //Getting the last non-empty
596  while (length(*(--last)) <= char_offset);
597  ++last;
598  //Offsetting on identical characters. This section works
599  //a character at a time for optimal worst-case performance.
600  update_offset(curr, last, char_offset, get_character, length);
601 
602  const unsigned bin_count = (1 << (sizeof(Unsigned_char_type)*8));
603  //Equal worst-case of radix and comparison is when bin_count = n*log(n).
604  const unsigned max_size = bin_count;
605  const unsigned membin_count = bin_count + 1;
606  const unsigned max_bin = bin_count - 1;
607  unsigned cache_end;
608  RandomAccessIter * bins = size_bins(bin_sizes, bin_cache, cache_offset,
609  cache_end, membin_count);
610  RandomAccessIter *end_bin = &(bin_cache[cache_offset + max_bin]);
611 
612  //Calculating the size of each bin; this takes roughly 10% of runtime
613  for (RandomAccessIter current = first; current != last; ++current) {
614  if (length(*current) <= char_offset) {
615  bin_sizes[bin_count]++;
616  }
617  else
618  bin_sizes[max_bin - get_character((*current), char_offset)]++;
619  }
620  //Assign the bin positions
621  bin_cache[cache_offset] = first;
622  for (unsigned u = 0; u < membin_count - 1; u++)
623  bin_cache[cache_offset + u + 1] =
624  bin_cache[cache_offset + u] + bin_sizes[u];
625 
626  //Swap into place
627  RandomAccessIter next_bin_start = last;
628  //handling empty bins
629  RandomAccessIter * local_bin = &(bin_cache[cache_offset + bin_count]);
630  RandomAccessIter lastFull = *local_bin;
631  RandomAccessIter * target_bin;
632  //Iterating over each element in the bin of empties
633  for (RandomAccessIter current = *local_bin; current < next_bin_start;
634  ++current) {
635  //empties belong in this bin
636  while (length(*current) > char_offset) {
637  target_bin = end_bin - get_character((*current), char_offset);
638  iter_swap(current, (*target_bin)++);
639  }
640  }
641  *local_bin = next_bin_start;
642  next_bin_start = first;
643  //iterate backwards to find the last bin with elements in it
644  //this saves iterations in multiple loops
645  unsigned last_bin = max_bin;
646  for (; last_bin && !bin_sizes[last_bin]; --last_bin);
647  //This dominates runtime, mostly in the swap and bin lookups
648  for (unsigned u = 0; u < last_bin; ++u) {
649  local_bin = bins + u;
650  next_bin_start += bin_sizes[u];
651  //Iterating over each element in this bin
652  for (RandomAccessIter current = *local_bin; current < next_bin_start;
653  ++current) {
654  //Swapping into place until the correct element has been swapped in
655  for (target_bin = end_bin - get_character((*current), char_offset);
656  target_bin != local_bin;
657  target_bin = end_bin - get_character((*current), char_offset))
658  iter_swap(current, (*target_bin)++);
659  }
660  *local_bin = next_bin_start;
661  }
662  bins[last_bin] = lastFull;
663  //Recursing
664  RandomAccessIter lastPos = first;
665  //Skip this loop for empties
666  for (unsigned u = cache_offset; u <= cache_offset + last_bin;
667  lastPos = bin_cache[u], ++u) {
668  size_t count = bin_cache[u] - lastPos;
669  //don't sort unless there are at least two items to Compare
670  if (count < 2)
671  continue;
672  //using std::sort if its worst-case is better
673  if (count < max_size)
674  std::sort(lastPos, bin_cache[u], comp);
675  else
676  reverse_string_sort_rec<RandomAccessIter, Unsigned_char_type,
677  Get_char, Get_length, Compare>
678  (lastPos, bin_cache[u], char_offset + 1, bin_cache, cache_end,
679  bin_sizes, get_character, length, comp);
680  }
681  }
682 
683  //Holds the bin vector and makes the initial recursive call
684  template <class RandomAccessIter, class Unsigned_char_type>
685  inline typename boost::enable_if_c< sizeof(Unsigned_char_type) <= 2, void
686  >::type
687  string_sort(RandomAccessIter first, RandomAccessIter last,
688  Unsigned_char_type)
689  {
690  size_t bin_sizes[(1 << (8 * sizeof(Unsigned_char_type))) + 1];
691  std::vector<RandomAccessIter> bin_cache;
692  string_sort_rec<RandomAccessIter, Unsigned_char_type>
693  (first, last, 0, bin_cache, 0, bin_sizes);
694  }
695 
696  template <class RandomAccessIter, class Unsigned_char_type>
697  inline typename boost::disable_if_c< sizeof(Unsigned_char_type) <= 2, void
698  >::type
699  string_sort(RandomAccessIter first, RandomAccessIter last,
700  Unsigned_char_type)
701  {
702  //Warning that we're using std::sort, even though string_sort was called
703  BOOST_STATIC_WARNING( sizeof(Unsigned_char_type) <= 2 );
704  std::sort(first, last);
705  }
706 
707  //Holds the bin vector and makes the initial recursive call
708  template <class RandomAccessIter, class Unsigned_char_type>
709  inline typename boost::enable_if_c< sizeof(Unsigned_char_type) <= 2, void
710  >::type
711  reverse_string_sort(RandomAccessIter first, RandomAccessIter last,
712  Unsigned_char_type)
713  {
714  size_t bin_sizes[(1 << (8 * sizeof(Unsigned_char_type))) + 1];
715  std::vector<RandomAccessIter> bin_cache;
716  reverse_string_sort_rec<RandomAccessIter, Unsigned_char_type>
717  (first, last, 0, bin_cache, 0, bin_sizes);
718  }
719 
720  template <class RandomAccessIter, class Unsigned_char_type>
721  inline typename boost::disable_if_c< sizeof(Unsigned_char_type) <= 2, void
722  >::type
723  reverse_string_sort(RandomAccessIter first, RandomAccessIter last,
724  Unsigned_char_type)
725  {
726  typedef typename std::iterator_traits<RandomAccessIter>::value_type
727  Data_type;
728  //Warning that we're using std::sort, even though string_sort was called
729  BOOST_STATIC_WARNING( sizeof(Unsigned_char_type) <= 2 );
730  std::sort(first, last, std::greater<Data_type>());
731  }
732 
733  //Holds the bin vector and makes the initial recursive call
734  template <class RandomAccessIter, class Get_char, class Get_length,
735  class Unsigned_char_type>
736  inline typename boost::enable_if_c< sizeof(Unsigned_char_type) <= 2, void
737  >::type
738  string_sort(RandomAccessIter first, RandomAccessIter last,
739  Get_char get_character, Get_length length, Unsigned_char_type)
740  {
741  size_t bin_sizes[(1 << (8 * sizeof(Unsigned_char_type))) + 1];
742  std::vector<RandomAccessIter> bin_cache;
743  string_sort_rec<RandomAccessIter, Unsigned_char_type, Get_char,
744  Get_length>(first, last, 0, bin_cache, 0, bin_sizes, get_character, length);
745  }
746 
747  template <class RandomAccessIter, class Get_char, class Get_length,
748  class Unsigned_char_type>
749  inline typename boost::disable_if_c< sizeof(Unsigned_char_type) <= 2, void
750  >::type
751  string_sort(RandomAccessIter first, RandomAccessIter last,
752  Get_char get_character, Get_length length, Unsigned_char_type)
753  {
754  //Warning that we're using std::sort, even though string_sort was called
755  BOOST_STATIC_WARNING( sizeof(Unsigned_char_type) <= 2 );
756  std::sort(first, last);
757  }
758 
759  //Holds the bin vector and makes the initial recursive call
760  template <class RandomAccessIter, class Get_char, class Get_length,
761  class Compare, class Unsigned_char_type>
762  inline typename boost::enable_if_c< sizeof(Unsigned_char_type) <= 2, void
763  >::type
764  string_sort(RandomAccessIter first, RandomAccessIter last,
765  Get_char get_character, Get_length length, Compare comp, Unsigned_char_type)
766  {
767  size_t bin_sizes[(1 << (8 * sizeof(Unsigned_char_type))) + 1];
768  std::vector<RandomAccessIter> bin_cache;
769  string_sort_rec<RandomAccessIter, Unsigned_char_type, Get_char
770  , Get_length, Compare>
771  (first, last, 0, bin_cache, 0, bin_sizes, get_character, length, comp);
772  }
773 
774  //disable_if_c was refusing to compile, so rewrote to use enable_if_c
775  template <class RandomAccessIter, class Get_char, class Get_length,
776  class Compare, class Unsigned_char_type>
777  inline typename boost::enable_if_c< (sizeof(Unsigned_char_type) > 2), void
778  >::type
779  string_sort(RandomAccessIter first, RandomAccessIter last,
780  Get_char get_character, Get_length length, Compare comp, Unsigned_char_type)
781  {
782  //Warning that we're using std::sort, even though string_sort was called
783  BOOST_STATIC_WARNING( sizeof(Unsigned_char_type) <= 2 );
784  std::sort(first, last, comp);
785  }
786 
787  //Holds the bin vector and makes the initial recursive call
788  template <class RandomAccessIter, class Get_char, class Get_length,
789  class Compare, class Unsigned_char_type>
790  inline typename boost::enable_if_c< sizeof(Unsigned_char_type) <= 2, void
791  >::type
792  reverse_string_sort(RandomAccessIter first, RandomAccessIter last,
793  Get_char get_character, Get_length length, Compare comp, Unsigned_char_type)
794  {
795  size_t bin_sizes[(1 << (8 * sizeof(Unsigned_char_type))) + 1];
796  std::vector<RandomAccessIter> bin_cache;
797  reverse_string_sort_rec<RandomAccessIter, Unsigned_char_type, Get_char,
798  Get_length, Compare>
799  (first, last, 0, bin_cache, 0, bin_sizes, get_character, length, comp);
800  }
801 
802  template <class RandomAccessIter, class Get_char, class Get_length,
803  class Compare, class Unsigned_char_type>
804  inline typename boost::disable_if_c< sizeof(Unsigned_char_type) <= 2, void
805  >::type
806  reverse_string_sort(RandomAccessIter first, RandomAccessIter last,
807  Get_char get_character, Get_length length, Compare comp, Unsigned_char_type)
808  {
809  //Warning that we're using std::sort, even though string_sort was called
810  BOOST_STATIC_WARNING( sizeof(Unsigned_char_type) <= 2 );
811  std::sort(first, last, comp);
812  }
813  }
814 }
815 }
816 
817 #endif
void update_offset(RandomAccessIter first, RandomAccessIter finish, size_t &char_offset)
Definition: string_sort.hpp:39
Definition: constants.hpp:11
offset_char_less_than(size_t char_offset)
Definition: string_sort.hpp:142
bool operator()(const Data_type &x, const Data_type &y) const
Definition: string_sort.hpp:123
size_t fchar_offset
Definition: string_sort.hpp:136
Definition: string_sort.hpp:121
void string_sort_rec(RandomAccessIter first, RandomAccessIter last, size_t char_offset, std::vector< RandomAccessIter > &bin_cache, unsigned cache_offset, size_t *bin_sizes)
Definition: string_sort.hpp:161
void reverse_string_sort_rec(RandomAccessIter first, RandomAccessIter last, size_t char_offset, std::vector< RandomAccessIter > &bin_cache, unsigned cache_offset, size_t *bin_sizes)
Definition: string_sort.hpp:266
offset_less_than(size_t char_offset)
Definition: string_sort.hpp:102
size_t fchar_offset
Definition: string_sort.hpp:153
void string_sort(RandomAccessIter first, RandomAccessIter last, Unsigned_char_type unused)
Definition: string_sort.hpp:29
Definition: string_sort.hpp:141
void reverse_string_sort(RandomAccessIter first, RandomAccessIter last, Compare comp, Unsigned_char_type unused)
Definition: string_sort.hpp:49
Definition: string_sort.hpp:101
offset_greater_than(size_t char_offset)
Definition: string_sort.hpp:122
bool operator()(const Data_type &x, const Data_type &y) const
Definition: string_sort.hpp:103
size_t fchar_offset
Definition: string_sort.hpp:116
Get_length length
Definition: string_sort.hpp:155
RandomAccessIter * size_bins(size_t *bin_sizes, std::vector< RandomAccessIter > &bin_cache, unsigned cache_offset, unsigned &cache_end, unsigned bin_count)
Definition: spreadsort_common.hpp:106
Get_char get_character
Definition: string_sort.hpp:154
bool operator()(const Data_type &x, const Data_type &y) const
Definition: string_sort.hpp:143