4 * Copyright 2003 Max-Planck-Institut fuer Informatik
8 //======================================================================
9 // Multikey quicksort for suffixes
10 //======================================================================
12 #ifndef STRINGSORT_HPP
13 #define STRINGSORT_HPP
15 #include "partition.hpp"
23 //----------------------------------------------------------------------
24 // function string_compare
26 // Compare two strings until a mismatch but no longer than limit
27 // characters. The sign of the result indicates the order of the
28 // strings and the absolute value the distance of the mismatch from
29 // the limit. Zero means that no mismatch was found within limit
32 // WARNING: A check for the end of a string is user's responsibility.
33 //----------------------------------------------------------------------
34 template <typename StringIterator, typename DifferenceType,
37 string_compare(StringIterator s1, StringIterator s2,
38 DifferenceType limit, CharCompare less) {
39 for ( ; limit > 0; --limit, ++s1, ++s2) {
40 if (less(*s1,*s2)) return -limit;
41 if (less(*s2,*s1)) return limit;
45 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
46 template <typename StringIterator, typename DifferenceType>
48 string_compare(StringIterator s1, StringIterator s2, DifferenceType limit)
50 typedef typename std::iterator_traits<StringIterator>::value_type
52 return string_compare(s1, s2, limit, std::less<char_type>());
56 //----------------------------------------------------------------------
57 // const int suffix_insertion_sort_threshold_
59 // Groups of suffixes with size less or equal to this value are
60 // sorted by insertion sort.
61 //----------------------------------------------------------------------
62 static const int suffix_insertion_sort_threshold_ = 15;
64 //----------------------------------------------------------------------
65 // function suffix_insertion_sort
67 // Sort suffixes using insertion sort.
68 // A difference from a straightforward insertion sort is that
69 // the lengths of longest common prefixes of the already sorted
70 // suffixes are stored and used in later insertions.
71 //----------------------------------------------------------------------
72 template <typename StringIterator, typename SArrayIterator,
73 typename GroupHandler, typename CharCompare>
75 suffix_insertion_sort(StringIterator str, StringIterator str_end,
76 SArrayIterator sa, SArrayIterator sa_end,
77 typename std::iterator_traits<StringIterator>::difference_type limit,
78 GroupHandler handle_unsorted_group,
81 typedef typename std::iterator_traits<StringIterator>::difference_type
83 typedef typename std::iterator_traits<SArrayIterator>::difference_type
85 typedef typename std::iterator_traits<SArrayIterator>::value_type
88 static std::vector<str_diff_type> lcp(suffix_insertion_sort_threshold_);
89 lcp.resize(sa_end-sa);
92 for (sa_diff_type i = sa_end-sa-2; i >= 0; --i) {
93 pos_type i_pos = sa[i];
94 StringIterator i_str = str + i_pos;
95 StringIterator j_str = str + sa[i+1];
96 str_diff_type i_limit = std::min(limit, str_end - i_str);
97 str_diff_type ij_limit = std::min(i_limit, str_end - j_str);
98 str_diff_type dist = string_compare(i_str, j_str, ij_limit, less);
99 if (dist < 0 || (dist == 0 && i_limit == ij_limit)) {
100 lcp[i] = ij_limit + dist;
103 str_diff_type i_lcp = ij_limit - dist;
105 sa_diff_type j = i+1;
106 for ( ; lcp[j] >= i_lcp; ++j) {
107 if (lcp[j] == i_lcp) {
109 ij_limit = std::min(i_limit, str_end-j_str);
110 dist = string_compare(i_str+i_lcp, j_str+i_lcp,
111 ij_limit-i_lcp, less);
112 if (dist < 0 || (dist == 0 && i_limit == ij_limit)) {
113 lcp[j] = ij_limit + dist;
116 i_lcp = ij_limit - dist;
128 // find and process unsorted groups
129 typename std::vector<pos_type>::iterator beg, end, last = lcp.end()-1;
130 for (beg = lcp.begin(); beg < last; ++beg) {
133 do { ++end; } while (*end == limit);
134 handle_unsorted_group(sa+(beg-lcp.begin()), sa+(end-lcp.begin())+1);
139 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
140 template <typename StringIterator, typename SArrayIterator,
141 typename GroupHandler>
143 suffix_insertion_sort(StringIterator str, StringIterator str_end,
144 SArrayIterator sa, SArrayIterator sa_end,
145 typename std::iterator_traits<StringIterator>::difference_type limit,
146 GroupHandler handle_unsorted_group)
148 typedef typename std::iterator_traits<StringIterator>::value_type
150 suffix_insertion_sort(str, str_end, sa, sa_end, limit,
151 handle_unsorted_group, std::less<char_type>());
155 //----------------------------------------------------------------------
156 // functor char_access
158 // Compute the character that represents a suffix in comparisons.
159 //----------------------------------------------------------------------
160 template <typename StringIterator, typename SArrayIterator>
162 : public std::unary_function<SArrayIterator,
163 typename std::iterator_traits<StringIterator>::value_type>
166 const StringIterator str;
168 char_access(StringIterator s) : str(s) {}
170 typename std::iterator_traits<StringIterator>::value_type
171 operator() (SArrayIterator i) const {
177 //======================================================================
178 // function suffix_mkqsort
179 // function suffix_mkqsort_noempty
181 // Sort suffixes using multikey quicksort.
182 // These two functions call each other to implement multikey quicksort.
183 // Either can be used as the initial call. suffix_mkqsort_noempty
184 // places more restrictions on the input but can be slightly faster.
185 //======================================================================
187 //----------------------------------------------------------------------
188 // function suffix_mkqsort
190 // Sort suffixes using multikey quicksort.
191 // This is the more general form.
192 //----------------------------------------------------------------------
193 template <typename StringIterator, typename SArrayIterator,
194 typename GroupHandler, typename CharCompare>
196 suffix_mkqsort(StringIterator str, StringIterator str_end,
197 SArrayIterator sa, SArrayIterator sa_end,
198 typename std::iterator_traits<StringIterator>::difference_type limit,
199 GroupHandler handle_unsorted_group,
202 typedef typename std::iterator_traits<SArrayIterator>::value_type pos_type;
204 while (sa_end-sa > suffix_insertion_sort_threshold_ && limit > 0) {
206 // check for empty suffix(es)
207 pos_type length = static_cast<pos_type>(str_end-str);
208 sa = std::partition(sa, sa_end,
209 std::bind2nd(std::equal_to<pos_type>(), length));
211 // this would check only for one empty suffix
212 //SArrayIterator empty = std::find(sa, sa_end, length);
213 //if (empty != sa_end) {
214 // std::swap(*sa, *empty);
219 char_access<StringIterator, SArrayIterator> key(str);
220 SArrayIterator pivot = random_pivot(sa, sa_end, less, key);
221 std::pair<SArrayIterator,SArrayIterator> midrange;
222 midrange = ternary_partition(sa, sa_end, pivot, less, key);
225 if (midrange.first - sa > 1) {
226 suffix_mkqsort_noempty(str, str_end, sa, midrange.first,
227 limit, handle_unsorted_group, less);
231 if (sa_end - midrange.second > 1) {
232 suffix_mkqsort_noempty(str, str_end, midrange.second, sa_end,
233 limit, handle_unsorted_group, less);
238 sa_end = midrange.second;
240 // move to the next position in the suffixes
248 handle_unsorted_group(sa, sa_end);
250 suffix_insertion_sort(str, str_end, sa, sa_end, limit,
251 handle_unsorted_group, less);
255 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
256 template <typename StringIterator, typename SArrayIterator,
257 typename GroupHandler>
259 suffix_mkqsort(StringIterator str, StringIterator str_end,
260 SArrayIterator sa, SArrayIterator sa_end,
261 typename std::iterator_traits<StringIterator>::difference_type limit,
262 GroupHandler handle_unsorted_group)
264 typedef typename std::iterator_traits<StringIterator>::value_type
266 suffix_mkqsort(str, str_end, sa, sa_end, limit,
267 handle_unsorted_group, std::less<char_type>());
271 //----------------------------------------------------------------------
272 // function suffix_mkqsort_noempty
274 // Sort suffixes using multikey quicksort.
275 // Used as a subroutine of suffix_mkqsort but can also be called
276 // directly. Places more restrictions on input (see below) but
277 // can be slightly faster than suffix_mkqsort.
279 // The additional restrictions:
280 // PRECONDITION: limit>0 and there are no empty suffixes
281 //----------------------------------------------------------------------
282 template <typename StringIterator, typename SArrayIterator,
283 typename GroupHandler, typename CharCompare>
285 suffix_mkqsort_noempty(StringIterator str, StringIterator str_end,
286 SArrayIterator sa, SArrayIterator sa_end,
287 typename std::iterator_traits<StringIterator>::difference_type limit,
288 GroupHandler handle_unsorted_group,
291 typedef typename std::iterator_traits<SArrayIterator>::value_type pos_type;
293 while (sa_end-sa > suffix_insertion_sort_threshold_) {
296 char_access<StringIterator, SArrayIterator> key(str);
297 SArrayIterator pivot = random_pivot(sa, sa_end, less, key);
298 std::pair<SArrayIterator,SArrayIterator> midrange;
299 midrange = ternary_partition(sa, sa_end, pivot, less, key);
302 if (midrange.first - sa > 1) {
303 suffix_mkqsort_noempty(str, str_end, sa, midrange.first,
304 limit, handle_unsorted_group, less);
308 if (midrange.second - midrange.first > 1) {
309 suffix_mkqsort(str+1, str_end, midrange.first, midrange.second,
310 limit-1, handle_unsorted_group, less);
314 sa = midrange.second;
318 suffix_insertion_sort(str, str_end, sa, sa_end, limit,
319 handle_unsorted_group, less);
322 // - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
323 template <typename StringIterator, typename SArrayIterator,
324 typename GroupHandler>
326 suffix_mkqsort_noempty(StringIterator str, StringIterator str_end,
327 SArrayIterator sa, SArrayIterator sa_end,
328 typename std::iterator_traits<StringIterator>::difference_type limit,
329 GroupHandler handle_unsorted_group)
331 typedef typename std::iterator_traits<StringIterator>::value_type
333 suffix_mkqsort_noempty(str, str_end, sa, sa_end, limit,
334 handle_unsorted_group, std::less<char_type>());
337 #endif // STRINGSORT_HPP