BWT via Difference cover
[SXSI/TextCollection.git] / dcover / stringsort.hpp
diff --git a/dcover/stringsort.hpp b/dcover/stringsort.hpp
new file mode 100644 (file)
index 0000000..faef78f
--- /dev/null
@@ -0,0 +1,337 @@
+/* 
+ * stringsort.hpp
+ *
+ * Copyright 2003 Max-Planck-Institut fuer Informatik 
+ *
+ */
+
+//======================================================================
+// Multikey quicksort for suffixes
+//======================================================================
+
+#ifndef STRINGSORT_HPP
+#define STRINGSORT_HPP
+
+#include "partition.hpp"
+
+#include <algorithm>
+#include <iterator>
+#include <functional>
+#include <vector>
+
+
+//----------------------------------------------------------------------
+// function string_compare
+//
+// Compare two strings until a mismatch but no longer than limit 
+// characters.  The sign of the result indicates the order of the 
+// strings and the absolute value the distance of the mismatch from 
+// the limit. Zero means that no mismatch was found within limit 
+// characters.
+//
+// WARNING: A check for the end of a string is user's responsibility.
+//----------------------------------------------------------------------
+template <typename StringIterator, typename DifferenceType,
+         typename CharCompare>
+DifferenceType
+string_compare(StringIterator s1, StringIterator s2, 
+              DifferenceType limit, CharCompare less) {
+  for ( ; limit > 0; --limit, ++s1, ++s2) {
+    if (less(*s1,*s2)) return -limit;
+    if (less(*s2,*s1)) return limit;
+  }
+  return 0;
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+template <typename StringIterator, typename DifferenceType>
+inline DifferenceType
+string_compare(StringIterator s1, StringIterator s2, DifferenceType limit) 
+{
+  typedef typename std::iterator_traits<StringIterator>::value_type
+    char_type;
+  return string_compare(s1, s2, limit, std::less<char_type>());
+}
+
+
+//----------------------------------------------------------------------
+// const int suffix_insertion_sort_threshold_
+//
+// Groups of suffixes with size less or equal to this value are 
+// sorted by insertion sort.
+//----------------------------------------------------------------------
+static const int suffix_insertion_sort_threshold_ = 15;
+
+//----------------------------------------------------------------------
+// function suffix_insertion_sort
+// 
+// Sort suffixes using insertion sort.
+// A difference from a straightforward insertion sort is that
+// the lengths of longest common prefixes of the already sorted 
+// suffixes are stored and used in later insertions.
+//----------------------------------------------------------------------
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler, typename CharCompare>
+void
+suffix_insertion_sort(StringIterator str, StringIterator str_end,
+                     SArrayIterator sa, SArrayIterator sa_end,
+     typename std::iterator_traits<StringIterator>::difference_type limit,
+                     GroupHandler handle_unsorted_group,
+                     CharCompare less)
+{
+  typedef typename std::iterator_traits<StringIterator>::difference_type 
+    str_diff_type;
+  typedef typename std::iterator_traits<SArrayIterator>::difference_type 
+    sa_diff_type;
+  typedef typename std::iterator_traits<SArrayIterator>::value_type 
+    pos_type;
+  
+  static std::vector<str_diff_type> lcp(suffix_insertion_sort_threshold_);
+  lcp.resize(sa_end-sa);
+  lcp.back() = -1;
+  
+  for (sa_diff_type i = sa_end-sa-2; i >= 0; --i) {
+    pos_type i_pos = sa[i];
+    StringIterator i_str = str + i_pos;
+    StringIterator j_str = str + sa[i+1];
+    str_diff_type i_limit = std::min(limit, str_end - i_str);
+    str_diff_type ij_limit = std::min(i_limit, str_end - j_str);
+    str_diff_type dist = string_compare(i_str, j_str, ij_limit, less);
+    if (dist < 0 || (dist == 0 && i_limit == ij_limit)) {
+      lcp[i] = ij_limit + dist;
+    } else {
+      sa[i] = sa[i+1];
+      str_diff_type i_lcp = ij_limit - dist;
+
+      sa_diff_type j = i+1;
+      for ( ; lcp[j] >= i_lcp; ++j) {
+       if (lcp[j] == i_lcp) {
+         j_str = str+sa[j+1];
+         ij_limit = std::min(i_limit, str_end-j_str);
+         dist = string_compare(i_str+i_lcp, j_str+i_lcp, 
+                               ij_limit-i_lcp, less);
+         if (dist < 0 || (dist == 0 && i_limit == ij_limit)) {
+           lcp[j] = ij_limit + dist;
+           break;
+         } else {
+           i_lcp = ij_limit - dist;
+         }
+       }
+       sa[j] = sa[j+1];
+       lcp[j-1] = lcp[j];
+      }
+
+      sa[j] = i_pos;
+      lcp[j-1] = i_lcp;
+    }
+  }
+
+  // find and process unsorted groups
+  typename std::vector<pos_type>::iterator beg, end, last = lcp.end()-1;
+  for (beg = lcp.begin(); beg < last; ++beg) {
+    if (*beg == limit) {
+      end = beg;
+      do { ++end; } while (*end == limit);
+      handle_unsorted_group(sa+(beg-lcp.begin()), sa+(end-lcp.begin())+1);
+      beg = end;
+    }
+  }
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler>
+void
+suffix_insertion_sort(StringIterator str, StringIterator str_end,
+                     SArrayIterator sa, SArrayIterator sa_end,
+    typename std::iterator_traits<StringIterator>::difference_type limit,
+                     GroupHandler handle_unsorted_group)
+{
+  typedef typename std::iterator_traits<StringIterator>::value_type
+    char_type;
+  suffix_insertion_sort(str, str_end, sa, sa_end, limit,
+                       handle_unsorted_group, std::less<char_type>());
+}
+
+
+//----------------------------------------------------------------------
+// functor char_access
+//
+// Compute the character that represents a suffix in comparisons.
+//----------------------------------------------------------------------
+template <typename StringIterator, typename SArrayIterator>
+class char_access
+  : public std::unary_function<SArrayIterator,
+          typename std::iterator_traits<StringIterator>::value_type>
+{
+private:
+  const StringIterator str;
+public:
+  char_access(StringIterator s) : str(s) {}
+
+  typename std::iterator_traits<StringIterator>::value_type
+  operator() (SArrayIterator i) const {
+    return str[*i]; 
+  }
+};
+
+
+//======================================================================
+// function suffix_mkqsort
+// function suffix_mkqsort_noempty
+//
+// Sort suffixes using multikey quicksort.
+// These two functions call each other to implement multikey quicksort.
+// Either can be used as the initial call. suffix_mkqsort_noempty
+// places more restrictions on the input but can be slightly faster.
+//======================================================================
+
+//----------------------------------------------------------------------
+// function suffix_mkqsort
+//
+// Sort suffixes using multikey quicksort.
+// This is the more general form.
+//----------------------------------------------------------------------
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler, typename CharCompare>
+void
+suffix_mkqsort(StringIterator str, StringIterator str_end,
+              SArrayIterator sa, SArrayIterator sa_end,
+    typename std::iterator_traits<StringIterator>::difference_type limit,
+              GroupHandler handle_unsorted_group,
+              CharCompare less)
+{                
+  typedef typename std::iterator_traits<SArrayIterator>::value_type pos_type;
+
+  while (sa_end-sa > suffix_insertion_sort_threshold_ && limit > 0) {
+
+    // check for empty suffix(es)
+    pos_type length = static_cast<pos_type>(str_end-str);
+    sa = std::partition(sa, sa_end, 
+                       std::bind2nd(std::equal_to<pos_type>(), length));
+
+    // this would check only for one empty suffix
+    //SArrayIterator empty = std::find(sa, sa_end, length);
+    //if (empty != sa_end) {
+    //  std::swap(*sa, *empty);
+    //  ++sa;
+    //}
+
+    // partition
+    char_access<StringIterator, SArrayIterator> key(str);
+    SArrayIterator pivot = random_pivot(sa, sa_end, less, key);
+    std::pair<SArrayIterator,SArrayIterator> midrange;
+    midrange = ternary_partition(sa, sa_end, pivot, less, key);
+
+    // recurse on <-part
+    if (midrange.first - sa > 1) {
+      suffix_mkqsort_noempty(str, str_end, sa, midrange.first, 
+                            limit, handle_unsorted_group, less);
+    }
+
+    // recurse on >-part
+    if (sa_end - midrange.second > 1) {
+      suffix_mkqsort_noempty(str, str_end, midrange.second, sa_end, 
+                            limit, handle_unsorted_group, less);
+    }
+
+    // loop on =-part
+    sa = midrange.first;
+    sa_end = midrange.second;
+
+    // move to the next position in the suffixes
+    ++str;
+    --limit;
+
+  }
+
+  if (sa_end-sa > 1) {
+    if (limit == 0) {
+      handle_unsorted_group(sa, sa_end);
+    } else {
+      suffix_insertion_sort(str, str_end, sa, sa_end, limit, 
+                           handle_unsorted_group, less);
+    }
+  }
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler>
+void
+suffix_mkqsort(StringIterator str, StringIterator str_end,
+              SArrayIterator sa, SArrayIterator sa_end,
+     typename std::iterator_traits<StringIterator>::difference_type limit,
+              GroupHandler handle_unsorted_group)
+{
+  typedef typename std::iterator_traits<StringIterator>::value_type
+    char_type;
+  suffix_mkqsort(str, str_end, sa, sa_end, limit,
+                handle_unsorted_group, std::less<char_type>());
+}
+
+
+//----------------------------------------------------------------------
+// function suffix_mkqsort_noempty
+//
+// Sort suffixes using multikey quicksort.
+// Used as a subroutine of suffix_mkqsort but can also be called 
+// directly.  Places more restrictions on input (see below) but
+// can be slightly faster than suffix_mkqsort.
+//
+// The additional restrictions:
+// PRECONDITION: limit>0 and there are no empty suffixes
+//----------------------------------------------------------------------
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler, typename CharCompare>
+void
+suffix_mkqsort_noempty(StringIterator str, StringIterator str_end,
+                      SArrayIterator sa, SArrayIterator sa_end,
+    typename std::iterator_traits<StringIterator>::difference_type limit,
+                      GroupHandler handle_unsorted_group,
+                      CharCompare less)
+{                
+  typedef typename std::iterator_traits<SArrayIterator>::value_type pos_type;
+
+  while (sa_end-sa > suffix_insertion_sort_threshold_) {
+
+    // partition
+    char_access<StringIterator, SArrayIterator> key(str);
+    SArrayIterator pivot = random_pivot(sa, sa_end, less, key);
+    std::pair<SArrayIterator,SArrayIterator> midrange;
+    midrange = ternary_partition(sa, sa_end, pivot, less, key);
+
+    // recurse on <-part
+    if (midrange.first - sa > 1) {
+      suffix_mkqsort_noempty(str, str_end, sa, midrange.first, 
+                            limit, handle_unsorted_group, less);
+    }
+
+    // recurse on =-part
+    if (midrange.second - midrange.first > 1) {
+      suffix_mkqsort(str+1, str_end, midrange.first, midrange.second, 
+                    limit-1, handle_unsorted_group, less);
+    }
+
+    // loop on >-part
+    sa = midrange.second;
+  }
+
+  if (sa_end-sa > 1) {
+    suffix_insertion_sort(str, str_end, sa, sa_end, limit, 
+                         handle_unsorted_group, less);
+  }
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler>
+void
+suffix_mkqsort_noempty(StringIterator str, StringIterator str_end,
+                      SArrayIterator sa, SArrayIterator sa_end,
+      typename std::iterator_traits<StringIterator>::difference_type limit,
+                      GroupHandler handle_unsorted_group)
+{
+  typedef typename std::iterator_traits<StringIterator>::value_type
+    char_type;
+  suffix_mkqsort_noempty(str, str_end, sa, sa_end, limit,
+                        handle_unsorted_group, std::less<char_type>());
+}
+    
+#endif // STRINGSORT_HPP