BWT via Difference cover

author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Fri, 6 Mar 2009 13:39:13 +0000 (13:39 +0000)

committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Fri, 6 Mar 2009 13:39:13 +0000 (13:39 +0000)
author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Fri, 6 Mar 2009 13:39:13 +0000 (13:39 +0000)
committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Fri, 6 Mar 2009 13:39:13 +0000 (13:39 +0000)
diff --git a/dcover/bwt.hpp b/dcover/bwt.hpp

new file mode 100644 (file)

index 0000000..b717226
--- /dev/null
+++ b/dcover/bwt.hpp
@@ -0,0 +1,456 @@
+/*
+ * bwt.hpp
+ *
+ * Copyright 2004 Juha Ka"rkka"inen <juha.karkkainen@cs.helsinki.fi>
+ *
+ */
+
+#include "dcsuffixsort.hpp"
+#include "dcsamplesort.hpp"
+#include "stringsort.hpp"
+#include "difference_cover.hpp"
+#include "distribute.hpp"
+#include "kmpsplit.hpp"
+/*
+#include "doubling.hpp"
+*/
+
+#include <iterator>
+#include <vector>
+#include <set>
+#include <iostream>
+#include <iomanip>
+#include <cassert>
+/*
+#include <algorithm>
+#include <cstdlib>
+*/
+
+#ifndef DCOVER
+#define DCOVER 8
+#endif
+
+#ifndef RANDSEED
+#define RANDSEED 46
+#endif
+
+#define MIN_PIVOTS 3
+#define OVERSAMPLING_RATE 10.0
+
+
+struct do_nothing {
+  template <typename Iterator>
+  void operator() (Iterator, Iterator) {}
+};
+
+
+
+
+template <class StringIterator, class OutputIterator>
+typename std::iterator_traits<StringIterator>::difference_type
+compute_bwt (StringIterator str, StringIterator str_end, 
+            OutputIterator bwt, unsigned endmarkers, unsigned int dcover = DCOVER)
+{
+
+  typedef typename std::iterator_traits<StringIterator>::difference_type 
+                        difftype;   // signed integral type    
+  typedef unsigned long sizetype;   // unsigned integral type
+  typedef difftype postype;   // type for representing vector positions
+
+  typedef typename std::vector<postype>::iterator pos_iterator;
+
+#if DEBUG > 0
+  std::cout << "BWT construction starts" << std::endl;
+#endif
+
+  //-----------------
+  // setup and check
+  //-----------------
+
+  sizetype length = std::distance(str, str_end);
+
+#if DEBUG > 1
+  typedef typename std::iterator_traits<StringIterator>::value_type
+    chartype;
+  typedef typename std::iterator_traits<OutputIterator>::value_type
+    bwttype;
+  std::cout << " text memory=" << length*sizeof(chartype) << "\n"
+           << " bwt memory=" << (length+1)*sizeof(chartype) << std::endl;
+#endif
+
+  dc_sampler<postype> dcsampler(length+1, dcover);
+
+#if DEBUG > 1
+  std::cout << " dcsampler memory=" 
+           << (dcsampler.packed_period() + 2*dcsampler.period()) 
+                * sizeof(unsigned)
+           << std::endl;
+#endif
+
+  sizetype blocksize = dcsampler.samplesize();
+  sizetype npivots = MIN_PIVOTS;
+
+  // make average bucket size <= blocksize/OVERSAMPLING_RATE
+  while ((OVERSAMPLING_RATE*length)/(npivots+1.0) > blocksize) ++npivots;
+
+#if DEBUG > 1
+  std::cout << " length=" << length 
+           << " samplesize=" << dcsampler.samplesize()
+           << " blocksize=" << blocksize
+           << " npivots=" << npivots
+           << std::endl;
+#endif
+
+  if (dcsampler.period() + 2*npivots > length) {
+    // too small text for sampling
+
+#if DEBUG > 0
+    std::cout << "small text: sort all suffixes\n";
+#endif
+
+    std::vector<postype> sa(length);
+    for (pos_iterator i=sa.begin(); i!=sa.end(); ++i) {
+      *i = i - sa.begin();
+    }
+    suffix_mkqsort(str, str_end, sa.begin(), sa.end(), length, do_nothing());
+    difftype eof_position = 0;
+    for (pos_iterator i=sa.begin(); i!=sa.end(); ++i) {
+      if (*i != 0) {
+          if (str[*i-1] <= endmarkers)
+              *bwt++ = 0;
+          else
+              *bwt++ = str[*i-1] - endmarkers;
+      } else {
+          if (str[*(i-1)-1] <= endmarkers)
+              *bwt++ = 0;
+          else
+              *bwt++ = str[*(i-1)-1] - endmarkers;
+          eof_position = i - sa.begin() +1; // FIXME
+      }
+    }
+    return eof_position;
+  }
+
+  // don't take too short suffixes as pivots
+  unsigned int pivotrange = length-dcsampler.period()+1;
+
+  //-----------------------------------
+  // setup difference cover sample
+  //-----------------------------------
+
+#if DEBUG > 0
+  std::cout << "difference cover construction ... " << std::endl;
+#endif
+
+  sizetype samplesize = dcsampler.samplesize();
+  std::vector<difftype> dcranks(samplesize);
+  std::vector<difftype> suffixes(samplesize);
+
+#if DEBUG > 1
+  std::cout << "dcranks memory=" 
+           << samplesize * sizeof(difftype)
+           << "\n";
+  std::cout << "suffixes memory=" 
+           << samplesize * sizeof(difftype)
+           << std::endl;
+#endif
+
+  sort_dc_sample(dcsampler, str, str_end,
+                dcranks.begin(), dcranks.end(),
+                 suffixes.begin(), suffixes.end());
+
+#if DEBUG > 3
+  std::cout << "sample ranks:\n";
+  std::copy(dcranks.begin(), dcranks.end(),
+           std::ostream_iterator<int>(std::cout,"\n"));
+  std::cout << "end of sample ranks" << std::endl;
+#endif
+
+
+
+  //-------------------  
+  // setup pivots
+  //-------------------
+
+#if DEBUG > 0
+  std::cout << "setting up pivots ..." << std::endl;
+#endif
+
+  std::set<difftype> pivotset;
+  double scale_factor = (pivotrange * 1.0L)/(RAND_MAX+1.0L);
+  std::srand(RANDSEED);
+  while (pivotset.size() < npivots) {
+    int r = std::rand();
+    int newpivot = static_cast<difftype>(scale_factor*r);
+    pivotset.insert(newpivot);
+  }
+
+#if DEBUG > 1
+  std::cout << " pivotset memory = " << pivotset.size()
+           << " elements" << std::endl;
+#endif
+
+  std::vector<difftype> pivots(pivotset.begin(), pivotset.end());
+  pivotset.clear();
+
+  // sort the pivots
+  dc_sort_suffixes(str, str_end, pivots.begin(), pivots.end(),
+                  dcsampler, dcranks.begin(), dcranks.end());
+
+#if DEBUG > 2
+  std::cout << "pivots sorted lexicographically:\n";
+  for (pos_iterator i = pivots.begin(); i != pivots.end(); ++i) {
+    std::cout << std::setw(8) << *i << " ";
+    std::copy(str+*i, std::min(str_end, str+*i+40),
+             std::ostream_iterator<char>(std::cout,""));
+    std::cout << std::endl;
+  }
+#endif
+
+
+#if DEBUG > 0
+  std::cout << " constructing distributor ... " << std::endl;
+#endif
+
+  suffix_distributor<StringIterator, pos_iterator, 
+                     dc_sampler<postype>, pos_iterator>
+    distributor(str, str_end, pivots.begin(), pivots.end(), 
+               dcsampler, dcranks.begin());
+  
+#if DEBUG > 1
+  distributor.print();
+#endif
+
+
+
+  //------------------------
+  // compute bucketsizes
+  //------------------------
+
+#if DEBUG > 0
+  std::cout << "computing bucket sizes ... " << std::endl;
+#endif
+
+  std::vector<difftype> buckets(distributor.nbuckets());
+
+#if DEBUG > 1
+  std::cout << " number of buckets: " << distributor.nbuckets() << "\n";
+  std::cout << " bucket vector memory=" 
+           << buckets.size() * sizeof(difftype)
+           << std::endl;
+#endif
+
+  for (StringIterator suf = str; suf != str_end; ++suf) {
+    unsigned int bucket = distributor.find_bucket(suf);
+    ++buckets[bucket];
+
+#if DEBUG > 3
+    std::cout << " into bucket " << bucket;
+    std::cout << ":" << std::setw(7) << suf-str << " ";
+    std::copy(suf, std::min(str_end, suf+40),
+             std::ostream_iterator<char>(std::cout,""));
+    std::cout << std::endl;
+#endif
+
+  }
+  // ++buckets[0]; // empty suffix
+
+#if DEBUG > 1
+  std::cout << " bucket sizes: ";
+  std::copy(buckets.begin(), buckets.end(),
+           std::ostream_iterator<int>(std::cout," "));
+  std::cout << std::endl;
+#endif
+
+
+  //-----------------------
+  // build and sort blocks
+  //-----------------------
+
+#if DEBUG > 0
+  std::cout << "Computing and sorting blocks ... " << std::endl;
+#endif
+
+//  *bwt++ = *(str_end-1);  //FIXME no empty suffix?
+
+  postype eof_position = 0;
+  sizetype total_sum = 1;  
+
+  postype bucket_begin = 0;
+  while (bucket_begin < (postype)buckets.size()) {
+
+    sizetype sum = buckets[bucket_begin];
+    postype bucket_end = bucket_begin + 1;
+    while (bucket_end < (postype)buckets.size() 
+          && sum+buckets[bucket_end] <= blocksize) {
+      sum += buckets[bucket_end];
+      ++bucket_end;
+    }
+    
+    postype left_pivot = bucket_begin-1;
+    postype right_pivot = bucket_end-1;
+
+    short lcp = 0;
+    if (left_pivot >= 0 && right_pivot < (postype)pivots.size()) {
+#if __WORDSIZE == 32
+      lcp = suffix_lcp(str+pivots[left_pivot], str+pivots[right_pivot], 
+                      str_end, 0, dcsampler.period()-1);
+#else
+      lcp = suffix_lcp(str+pivots[left_pivot], str+pivots[right_pivot], 
+               str_end, 0l, dcsampler.period()-1);
+#endif
+    }
+
+#if DEBUG > 1
+    std::cout << "block [" << bucket_begin << "," << bucket_end << ")"
+             << " size=" << sum
+             << " lcp=" << lcp
+             << std::endl;
+#endif
+
+    if (sum > blocksize) {
+      blocksize = sum;
+      suffixes.resize(blocksize);
+
+#if DEBUG > 1
+      std::cout << " blocksize increased to "
+               << blocksize << "\n";
+      std::cout << " suffixes memory resized to "
+               << blocksize*sizeof(postype) 
+               << std::endl;
+#endif
+
+    }
+
+    pos_iterator i = suffixes.begin();
+    //if (bucket_begin == 0) { *i++ = length;       // empty suffix } 
+
+    postype left_pivot_not_beyond = left_pivot<0 ? 0 : left_pivot;
+    kmp_split<StringIterator, dc_sampler<postype>, pos_iterator> 
+      left_cmp(str, str_end, str+pivots[left_pivot_not_beyond],
+                  dcsampler, dcranks.begin());
+
+    postype right_pivot_not_beyond = 
+      right_pivot==(postype)pivots.size() ? right_pivot-1 : right_pivot;
+    kmp_split<StringIterator, dc_sampler<postype>, pos_iterator> 
+      right_cmp(str, str_end, str+pivots[right_pivot_not_beyond],
+                  dcsampler, dcranks.begin());
+
+    for (StringIterator suf = str; suf != str_end; ++suf) {
+      
+#if DEBUG > 3
+      std::cout << ":" << std::setw(3) << suf-str << " ";
+      std::copy(suf, std::min(str_end, suf+40),
+               std::ostream_iterator<char>(std::cout,""));
+      std::cout << std::endl;
+#endif
+
+      /*
+      // is the suffix left of the block
+      if (left_pivot >=0) {
+       postype pivotpos = pivots[left_pivot];
+       if (*suf < str[pivotpos]) continue;
+       int lcp = suffix_lcp(suf, str+pivotpos, 
+                            str_end, 0, dcsampler.period()-1);
+       if (lcp < dcsampler.period()-1) {
+         if (suf+lcp == str_end || suf[lcp] < (str+pivotpos)[lcp]) {
+           continue;
+         }
+       } else {
+         postype sufpos = suf-str;
+         std::pair<postype,postype> result 
+           = dcsampler.pair(sufpos, pivotpos);
+         if (dcranks[result.first] < dcranks[result.second]){
+           continue;
+         }
+       }
+      }
+
+      // is the suffix right of the block
+      if (right_pivot < (postype)pivots.size()) {
+       postype pivotpos = pivots[right_pivot];
+       if (*suf > str[pivotpos]) continue;
+       int lcp = suffix_lcp(str+pivotpos, suf,
+                            str_end, 0, dcsampler.period()-1);
+       if (lcp < dcsampler.period()-1) {
+         if (suf+lcp < str_end && (str+pivotpos)[lcp] < suf[lcp]) {
+           continue;
+         }
+       } else {
+         postype sufpos = suf-str;
+         std::pair<postype,postype> result 
+           = dcsampler.pair(pivotpos, sufpos);
+         if (dcranks[result.first] <= dcranks[result.second]){
+           continue;
+         }
+       }
+      }
+      */
+
+      //postype bucket = distributor.find_bucket(suf);
+
+      bool to_left = left_pivot>0 && left_cmp.is_next_less(); 
+      bool to_right = right_pivot<(postype)pivots.size() && !right_cmp.is_next_less() ;
+
+      if (to_left || to_right) {
+       //assert (bucket_begin > bucket || bucket >= bucket_end);
+       continue;
+      }
+
+      // it is in the bucket
+      //postype bucket = distributor.find_bucket(suf);
+
+      //if (bucket_begin > bucket || bucket >= bucket_end) continue;
+
+      //assert (bucket_begin <= bucket && bucket < bucket_end);
+
+      *i++ = suf-str;
+#if DEBUG > 3
+      //std::cout << "into bucket " << bucket;
+      std::cout << ":" << std::setw(3) << suf-str << " ";
+      std::copy(suf, std::min(str_end, suf+40),
+               std::ostream_iterator<char>(std::cout,""));
+      std::cout << std::endl;
+#endif
+      
+    }
+
+    assert(i == suffixes.begin()+sum);    
+
+    dc_sort_suffixes(str, str_end, suffixes.begin(), i,
+                    dcsampler, dcranks.begin(), dcranks.end());
+
+#if DEBUG > 3
+    std::cout << "sorted bucket:\n";
+    for (pos_iterator j = suffixes.begin(); j != i; ++j) {
+      std::cout << std::setw(3) << *j << " ";
+      std::copy(str+*j, std::min(str_end, str+*j+40),
+               std::ostream_iterator<char>(std::cout,""));
+      std::cout << std::endl;
+    }
+#endif
+
+    for (pos_iterator j=suffixes.begin(); j != i; ++j) {
+      if (*j != 0) {
+          if (str[*j-1] <= endmarkers) 
+              *bwt++ = 0;
+          else
+              *bwt++ = str[*j-1] - endmarkers;
+      } else {
+          if (str[*(j-1)-1] <= endmarkers)
+              *bwt++ = 0;
+          else
+              *bwt++ = str[*(j-1)-1] - endmarkers;
+       eof_position = total_sum + (j - suffixes.begin());
+      }
+    }
+    
+    total_sum += sum;
+
+    bucket_begin = bucket_end;
+  }
+
+  assert(total_sum == length+1);
+
+  return eof_position;
+}
diff --git a/dcover/dcsamplesort.hpp b/dcover/dcsamplesort.hpp

new file mode 100644 (file)

index 0000000..507cfad
--- /dev/null
+++ b/dcover/dcsamplesort.hpp
@@ -0,0 +1,110 @@
+/*
+ * dcsamplesort.hpp
+ *
+ * Copyright 2003 Max-Planck-Institut fuer Informatik
+ * Copyright 2004 Juha Ka"rkka"inen <juha.karkkainen@cs.helsinki.fi>
+ *
+ */
+
+#ifndef DCSAMPLESORT_HPP
+#define DCSAMPLESORT_HPP
+
+#include "difference_cover.hpp"
+#include "doubling.hpp"
+#include "stringsort.hpp"
+/*
+*/
+
+#include <sstream>
+#include <iostream>
+/*
+#include <algorithm>
+#include <iterator>
+*/
+
+
+//----------------------------------------------------------------------
+// functor mark_unsorted_group
+//
+// When string sorting leaves a group of suffixes unsorted, 
+// it marks that group by calling this functor.
+//
+// The marking is done by one's complementing the first and last
+// entries of the group. Complemented values are assumed to become
+// negative. Complementing is used instead of negation to be able
+// mark the zero.
+//----------------------------------------------------------------------
+class mark_unsorted_group {
+public:
+  template <typename SArrayIterator>
+  void operator() (SArrayIterator beg, SArrayIterator end) const { 
+    *beg = ~*beg; 
+    --end;
+    *end = ~*end;  // note: group of size one stays unmarked
+  }
+};
+
+
+
+
+template <class Sampler, class RandomAccessIterator1,
+          class RandomAccessIterator2, class RandomAccessIterator3>
+void
+sort_dc_sample(Sampler sampler,
+              RandomAccessIterator1 str, RandomAccessIterator1 str_end,
+              RandomAccessIterator2 ranks, RandomAccessIterator2 ranks_end,
+              RandomAccessIterator3 suffixes, 
+              RandomAccessIterator3 suffixes_end)
+{
+  int size = sampler.samplesize();
+  if ((ranks_end-ranks != size) || (suffixes_end-suffixes != size)) {
+    std::ostringstream os;
+    os << "sort_dc_sample:"
+       << " wrong range size";
+    throw std::logic_error(os.str());
+  }
+
+  // collect the sample
+  RandomAccessIterator3 beyond = sampler.fill(suffixes);
+  if (suffixes_end != beyond) {
+    std::ostringstream os;
+    os << "sort_dc_sample:"
+       << " sample.fill produced " << beyond-suffixes
+       << " samples but samplesize is " << size;
+    throw std::logic_error(os.str());
+  }
+    
+  // sort the sample to limited depth
+  suffix_mkqsort(str, str_end, suffixes, suffixes_end, sampler.period(),
+                mark_unsorted_group());
+
+  // compute ranks
+  RandomAccessIterator3 first = suffixes;
+  while (first != suffixes_end) {
+    if (*first >= 0) {
+      RandomAccessIterator3 last;
+      ranks[sampler.pack(*first)] = first - suffixes;
+      for (last=first+1; last != suffixes_end && *last >= 0; ++last) {
+       ranks[sampler.pack(*last)] = last - suffixes;
+      }
+      *first = -(last-first);
+      first = last;
+    } else {
+      RandomAccessIterator3 last;
+      *first = ~*first;
+      for (last=first+1; *last >= 0; ++last) ;
+      *last = ~*last;
+      for ( ; first <= last; ++first) {
+       *first = sampler.pack(*first);
+       ranks[*first] = last - suffixes;
+      }
+    }
+  }
+
+  // complete sorting with the doubling algorithm
+  doubling_sort(suffixes, suffixes_end, ranks, ranks_end, 
+               sampler.packed_period());
+
+}
+
+#endif // DCSAMPLESORT_HPP
diff --git a/dcover/dcsuffixsort.hpp b/dcover/dcsuffixsort.hpp

new file mode 100644 (file)

index 0000000..b1dcfbc
--- /dev/null
+++ b/dcover/dcsuffixsort.hpp
@@ -0,0 +1,131 @@
+/*
+ * dcsuffixsort.hpp
+ *
+ * Copyright 2003 Max-Planck-Institut fuer Informatik
+ * Copyright 2004 Juha Ka"rkka"inen <juha.karkkainen@cs.helsinki.fi>
+ *
+ */
+
+#ifndef DCSUFFIXSORT_HPP
+#define DCSUFFIXSORT_HPP
+
+#include "stringsort.hpp"
+#include "difference_cover.hpp"
+#include "partition.hpp"
+/*
+#include "doubling.hpp"
+#include "stringsort.hpp"
+#include "checker.hpp"
+#include "timing.hpp"
+*/
+
+#include <utility>
+#include <iterator>
+#include <stdexcept>
+#include <sstream>
+/*
+#include <algorithm>
+#include <vector>
+#include <string>
+#include <functional>
+*/
+
+//----------------------------------------------------------------------
+// dc_suffix_compare
+//
+// compare two suffixes using difference covers
+//----------------------------------------------------------------------
+template <typename RankIterator>
+class dc_suffix_compare {
+private:
+  typedef typename std::iterator_traits<RankIterator>::value_type 
+          rank_type;
+  const RankIterator sample_ranks;
+  const dc_sampler<rank_type>& sample;
+public:
+  dc_suffix_compare(RankIterator beg, const dc_sampler<rank_type>& s) 
+    : sample_ranks(beg), sample(s) {}
+  template <typename StringPosType>
+  bool operator() (StringPosType a, StringPosType b) const {
+    std::pair<rank_type,rank_type> rep = sample.pair(a, b);
+    return sample_ranks[rep.first] < sample_ranks[rep.second];
+  }
+};
+
+
+//----------------------------------------------------------------------
+// functor mark_unsorted_group
+//
+// When string sorting leaves a group of suffixes unsorted, 
+// it marks that group by calling this functor.
+//
+// The marking is done by one's complementing the first and last
+// entries of the group. Complemented values are assumed to become
+// negative. Complementing is used instead of negation to be able
+// mark the zero.
+//----------------------------------------------------------------------
+class mark_unsorted_group2 {
+public:
+  template <typename SArrayIterator>
+  void operator() (SArrayIterator beg, SArrayIterator end) const { 
+    *beg = ~*beg; 
+    --end;
+    *end = ~*end;  // note: group of size one stays unmarked
+  }
+};
+
+/*
+//----------------------------------------------------------------------
+// functor do_nothing
+//
+// Replaces mark_unsorted_group when nothing needs to be done
+// to unsorted groups (this happens when doing stringsort only)
+//----------------------------------------------------------------------
+class do_nothing {
+public:
+  template <typename Iterator>
+  void operator() (Iterator, Iterator) {}
+};
+*/
+
+
+//----------------------------------------------------------------------
+// function dc_sort_suffixes
+// 
+// Sort a set of suffixes using a difference cover sample.
+//----------------------------------------------------------------------
+template <typename StringIterator, typename SArrayIterator,
+         typename DCSampler, typename RankIterator>
+void
+dc_sort_suffixes(StringIterator str, StringIterator str_end,
+             SArrayIterator sa, SArrayIterator sa_end,
+             const DCSampler& sampler,
+             RankIterator ranks, RankIterator ranks_end)
+{
+  if (ranks_end-ranks != sampler.samplesize()) {
+    std::ostringstream os;
+    os << "dc_suffix_sort:"
+       << " wrong range size";
+    throw std::logic_error(os.str());
+  }
+      
+  suffix_mkqsort(str, str_end, sa, sa_end, sampler.period(),
+                        mark_unsorted_group2());
+      
+  SArrayIterator first = sa;
+  while (true) {
+    while (first != sa_end && *first >= 0) ++first;
+    if (first == sa_end) break;
+    *first = ~*first;
+    SArrayIterator last = first + 1;
+    while (*last >= 0) ++last;
+    *last = ~*last;
+    ++last;
+    quicksort(first, last, 
+             dc_suffix_compare<RankIterator>(ranks, sampler));
+    first = last;
+  }
+  
+}
+
+#endif // DCSUFFIXSORT_HPP
diff --git a/dcover/difference_cover.cpp b/dcover/difference_cover.cpp

new file mode 100644 (file)

index 0000000..6b6744c
--- /dev/null
+++ b/dcover/difference_cover.cpp
@@ -0,0 +1,125 @@
+/*
+ * differerence_cover.cpp
+ *
+ * Copyright 2003 Max-Planck-institut fuer Informatik
+ *
+ */
+
+//======================================================================
+// This file implements the computation of difference covers
+//======================================================================
+
+#include <algorithm>
+#include <numeric>
+#include <stdexcept>
+#include <iostream>
+
+#include "difference_cover.hpp"
+
+
+//----------------------------------------------------------------------
+// Precomputed difference covers. 
+//
+// The covers from 0 to 6 are from
+//   Luk, Wong: Two new quorum base algorithms for distributed mutual
+//   exclusion. In Proc. 17th International Conference on Distributed 
+//   Computing Systems, pp. 100-106. IEEE, 1997.
+// Covers 7 and 8 were found using an exhaustive search algorithm.
+// All except cover 8 are known to be optimal.
+//----------------------------------------------------------------------
+namespace {
+  const unsigned max_precomputed_cover = 8;
+  const unsigned coversizes[max_precomputed_cover+1] = 
+    {1,2,3,4,5,7,9,13,20};
+  const unsigned cover0[] = {0};
+  const unsigned cover1[] = {0,1};
+  const unsigned cover2[] = {0,1,2};
+  const unsigned cover3[] = {0,1,2,4};
+  const unsigned cover4[] = {0,1,2,5,8};
+  const unsigned cover5[] = {0,1,2,3,7,11,19};   //{0,7,8,10,14,19,23};
+  const unsigned cover6[] = {0,1,2,5,14,16,34,42,59};
+  const unsigned cover7[] = {0,1,3,7,17,40,55,64,75,85,104,109,117};      
+  const unsigned cover8[] = {0,1,3,7,12,20,30,44,65,80,89,96,114,122,
+                            128,150,196,197,201,219}; 
+  const unsigned * covers[] = { cover0, cover1, cover2, cover3, cover4, 
+                               cover5, cover6, cover7, cover8 };
+}
+
+
+//----------------------------------------------------------------------
+// constructor of difference_cover
+//
+// Constructs a difference cover modulo a power of two.
+// (see difference_cover.hpp)
+//
+// Small difference covers have been precomputed (see above).
+// Larger difference covers are computed by the algorithm in
+//   C.J. Colbourn and A.C.H. Ling. Quorums from Difference Covers.
+//   Information Processing Letters 75(1-2):9-12, 2000
+//----------------------------------------------------------------------
+difference_cover::difference_cover(unsigned lv)
+  : logmod(lv), mask((1<<lv)-1),
+    coverrank(1<<lv), diff2pos(1<<lv,1<<lv)
+{
+  if (logmod <= max_precomputed_cover) {
+    
+    // use precomputed covers
+    coversize = coversizes[logmod]; 
+    cover.assign(covers[logmod], covers[logmod] + coversize);
+    
+  } else {
+    
+    // use the Colbourn-Ling algorithm
+    int v = (1<<logmod);
+    int r = 0;
+    while (24*r*r+36*r+13 < v) ++r;
+    coversize = 6*r+4;
+    cover.resize(coversize);
+    std::vector<unsigned>::iterator i = cover.begin();
+    *i = 0; ++i;
+    std::fill(i, i+r,         1); i+=r;
+    *i = r+1; ++i;
+    std::fill(i, i+r,     2*r+1); i+=r;
+    std::fill(i, i+2*r+1, 4*r+3); i+=2*r+1;
+    std::fill(i, i+r+1,   2*r+2); i+=r+1;
+    std::fill(i, i+r,         1); i+=r;
+    if (i != cover.end()) {
+      throw std::logic_error("error in the Coulbourn-Ling algorithm");
+    }
+    std::partial_sum(cover.begin(), cover.end(), cover.begin());
+    
+  }
+  
+#if DEBUG > 1
+  std::cerr << "difference cover modulo " << (1<<logmod);
+  std::cerr << " of size " << coversize << ": " << std::endl;
+  std::copy(cover.begin(), cover.end(), 
+           std::ostream_iterator<unsigned>(std::cerr, " ") );
+  std::cerr << std::endl;
+#endif
+  
+  // compute the lookup tables
+  unsigned i, j, modulus=1<<logmod;
+  for (i=0, j=0; i<modulus; ++i) {
+    coverrank[i] = j;
+    if (j<coversize && cover[j] <= i) { ++j; }
+  }
+  for (i=coversize-1; i<coversize; --i) {
+    for (j=0; j<coversize; ++j) {
+      diff2pos[(cover[j]-cover[i])&mask] = cover[i];
+    }
+  }
+  
+  // check that this is a difference cover
+  for (i=0; i<modulus; ++i) {
+    if (diff2pos[i] == modulus) {
+      throw std::logic_error("bad difference cover");
+    }
+#if DEBUG > 2
+    std::cerr << i << " = " << ((diff2pos[i]+i)&mask);
+    std::cerr << " - " << diff2pos[i];
+    std::cerr << " (mod " << modulus << ")";
+    std::cerr << std::endl;
+#endif
+  }
+}
diff --git a/dcover/difference_cover.hpp b/dcover/difference_cover.hpp

new file mode 100644 (file)

index 0000000..0fe116e
--- /dev/null
+++ b/dcover/difference_cover.hpp
@@ -0,0 +1,163 @@
+/*
+ * difference_cover.hpp
+ *
+ * Copyright 2003 Max-Planck-institut fuer Informatik
+ *
+ */
+
+//======================================================================
+// This file defines two classes:
+//
+// class difference_cover implements basic difference covers
+//     modulo any power of two
+// class dc_sampler implements the difference cover based sampling
+//     used in the suffix array construction
+//
+// Part of the implementation of class difference_cover is in
+// difference_cover.cpp
+//======================================================================
+
+#ifndef DIFFERENCE_COVER_HPP
+#define DIFFERENCE_COVER_HPP
+
+#include <vector>
+#include <utility>
+//#include <stdexcept>
+
+//----------------------------------------------------------------------
+// class difference_cover 
+//
+// A difference cover D modulo v is a set of integers in the range [0,v) 
+// such that for all i in [0,v), there exists j,k in D such that 
+// i = k-j (mod v).
+//
+// This class represents a difference cover modulo any power of two.
+// The sizes of the difference covers for small v are:
+//    v   1  2  4  8  16  32  64  128  256  512  1024  ...
+//   |D|  1  2  3  4   5   7   9   13   20   28    40  ...
+// For v<=128, the size of the difference cover is minimal.
+// For any v, the size is less than sqrt(1.5*v)+6 (sqrt(v) is a lower bound).
+//
+// Notes:
+// - All arithmetic is done with unsigned int, because it guarantees
+//   correct modular arithmetic including conversion to/from other
+//   integral types.  This is a main reason why supporting values of v
+//   other than powers of two would be difficult.
+//----------------------------------------------------------------------
+class difference_cover {
+
+public:
+
+  // create a difference cover modulo 2^lv
+  explicit difference_cover(unsigned lv); // defined in difference_cover.cpp
+
+  // basic dimensions
+  unsigned modulus() const { return 1<<logmod; }
+  unsigned size() const { return coversize; }
+
+  // division by modulus
+  unsigned mod(unsigned i) const { return i&mask; }
+  template <typename IntType>
+  IntType div(IntType i) const { return i>>logmod; }
+
+  // mapping [0,modulus) <-> [0,size)
+  unsigned rank(unsigned pos) const { return coverrank[pos&mask]; }
+  unsigned pos(unsigned rank) const { return cover[rank]; }
+
+  // compute k such that i+k and j+k are both in the cover
+  unsigned offset(unsigned i, unsigned j) const {
+    return (diff2pos[(j-i)&mask]-i)&mask;
+  }
+
+  typedef std::vector<unsigned>::const_iterator iterator;
+  iterator begin() const { return cover.begin(); }
+  iterator end() const { return cover.end(); }
+
+private:
+
+  unsigned logmod;              // log v
+  unsigned mask;                // v-1 = 11...1 in binary
+  unsigned coversize;           // |D|
+
+  std::vector<unsigned> cover;          // D
+  std::vector<unsigned> coverrank;      // rank of a value in D
+  std::vector<unsigned> diff2pos;       // lookup table for finding a
+                                        // pair with a given difference
+};
+
+
+
+//----------------------------------------------------------------------
+// class dc_sampler
+//
+// Represents the periodic sample positions chosen according 
+// to a difference cover.
+//
+// The main duties are:
+// 1. Fill the sparse suffix array with the sample positions.
+// 2. Provide a mapping pack() from sample positions to packed positions.
+//    A packed position pack(i) of a sample position i is used as 
+//    the corresponding position in the inverse sparse suffix array.
+//    pack(i) must satisfy:
+//    a) pack(i) is unique (different for each sample position i) and 
+//       in the range [0,samplesize)
+//    b) The difference pack(i+v)-pack(i), where v is the length of
+//       the period (modulus of the difference cover), is the same for
+//       all i. This difference is called the packed period.
+// 3. Find the representative pair for two arbitrary positions i and j.
+//    The representative pair is a pair of positions in the inverse
+//    sparse suffix array that can be used for comparing the suffixes 
+//    i and j if they have a common prefix of length period-1. 
+//    The representative pair is computed by finding k in [0,period)
+//    such that i+k and j+k are both sample positions and returning 
+//    the packed positions corresponding to i+k and j+k.
+//----------------------------------------------------------------------    
+template <typename OffsetType>
+class dc_sampler {
+public:
+
+  dc_sampler(OffsetType s, OffsetType v) : dcover(v), fullsize_(s), 
+     samplesize_(dcover.div(s)*dcover.size()+dcover.rank(s))
+  {}
+
+  OffsetType fullsize() const { return fullsize_; }
+  OffsetType samplesize() const { return samplesize_; }
+  OffsetType period() const { return dcover.modulus(); }
+  OffsetType packed_period() const { return dcover.size(); }
+       
+  OffsetType pack(OffsetType i) const {
+      return dcover.div(i) * dcover.size() + dcover.rank(i);
+  }
+
+  // representative pair
+  std::pair<OffsetType,OffsetType> 
+  pair(OffsetType i, OffsetType j) const {
+    OffsetType k = dcover.offset(i, j);
+    return std::make_pair(pack(i+k), pack(j+k));
+  }
+
+  // fills a range with the sample positions
+  template <typename OutputIterator>
+  OutputIterator fill(OutputIterator it) const {
+    difference_cover::iterator j = dcover.begin();
+    OffsetType i = 0;
+    OffsetType pos = i + static_cast<OffsetType>(*j);
+    while (pos < fullsize()) {
+      *it++ = pos;
+      if (++j == dcover.end()) {
+       i += dcover.modulus();
+       j = dcover.begin();
+      }
+      pos = i + static_cast<OffsetType>(*j);
+    }
+    return it;
+  }
+
+private:
+
+  const difference_cover dcover;
+  const OffsetType fullsize_;   // size of the range where samples are chosen
+  const OffsetType samplesize_;
+};
+
+#endif // DIFFERENCE_COVER_HPP
diff --git a/dcover/distribute.hpp b/dcover/distribute.hpp

new file mode 100644 (file)

index 0000000..04aea51
--- /dev/null
+++ b/dcover/distribute.hpp
@@ -0,0 +1,405 @@
+/*
+ * distribute.hpp
+ *
+ * Copyright 2004 Juha K"arkk"ainen <juha.karkkainen@cs.helsinki.fi>
+ *
+ */
+
+#ifndef DISTRIBUTE_HPP
+#define DISTRIBUTE_HPP
+
+#include "dcsuffixsort.hpp"
+
+#include <iterator>
+#include <algorithm>
+#include <utility>
+
+
+//----------------------------------------------------------------------
+// suffix_lcp
+//
+// Compute the length of the longest common prefix between two
+// suffixes a and b. 
+// str_end = end of the text
+// minlcp = length of the already known common prefix
+// maxlcp = stop when the length reaches maxlcp
+//----------------------------------------------------------------------
+template <class StringIterator, class LcpType>
+LcpType
+suffix_lcp(StringIterator a, StringIterator b, StringIterator str_end,
+          LcpType minlcp, LcpType maxlcp)
+{
+  if (a < b) swap(a,b);   // make a the shorter suffix
+  if ((str_end-a) > maxlcp) str_end = a + maxlcp;
+  std::pair<StringIterator,StringIterator> mm =
+             std::mismatch(a+minlcp, str_end, b+minlcp);
+  //std::cout << " lcp=" << mm.first-a << "\n";
+  return static_cast<LcpType>(mm.first-a);
+}
+
+
+//----------------------------------------------------------------------
+// class suffix_distributor
+//----------------------------------------------------------------------
+template <class StringIterator, class SuffixIterator,
+         class DCSampler, class RankIterator>
+class suffix_distributor
+{
+private:
+  //typedef typename std::iterator_traits<StringIterator>::difference_type 
+  //  lcptype; 
+  typedef short lcptype;
+  typedef typename 
+    std::iterator_traits<SuffixIterator>::difference_type difftype; 
+
+
+  StringIterator str, str_end;
+  SuffixIterator pivots, pivots_end;
+  std::vector<lcptype> lcps;
+  const DCSampler& dcsampler;
+  RankIterator dcranks;
+  lcptype maxlcp;
+
+  typedef typename std::vector<lcptype>::iterator lcpiterator;
+
+public:
+
+  suffix_distributor(StringIterator str_, StringIterator str_end_,
+                    SuffixIterator pivots_, SuffixIterator pivots_end_,
+                    const DCSampler& dcsampler_, RankIterator dcranks_)
+    : str(str_), str_end(str_end_), pivots(pivots_), pivots_end(pivots_end_),
+      lcps(pivots_end-pivots), dcsampler(dcsampler_), dcranks(dcranks_),
+    maxlcp(dcsampler.period())
+  {
+    compute_lcps();
+  }
+
+  void reset(SuffixIterator pivots_, SuffixIterator pivots_end_)
+  {
+    pivots=pivots_; 
+    pivots_end=pivots_end_;
+    lcps.resize(pivots_end-pivots);
+
+    compute_lcps();
+  }
+  
+
+  difftype nbuckets() const{ return pivots_end-pivots+1; }
+
+  difftype find_bucket(StringIterator suf) const {
+    return binary_search(suf);
+    //return i - pivots;
+  }
+
+  void
+  print() const {
+#if DEBUG > 1
+    std::cout << "distributor memory=" 
+             << lcps.size()*sizeof(lcptype)
+             << std::endl;
+#if DEBUG > 2
+    std::cout << "pivots and lcps:\n";
+    for (SuffixIterator i = pivots; i != pivots_end; ++i) {
+      std::cout << std::setw(4) << i-pivots
+               << std::setw(4) << int(lcps[i-pivots])
+               << std::setw(9) << *i << " ";
+      std::copy(str+*i, std::min(str_end, str+*i+40),
+               std::ostream_iterator<char>(std::cout,""));
+      std::cout << "\n";
+    }
+    std::cout << "distributor end\n";
+#endif
+#endif
+  }
+
+
+private:
+
+  bool dc_less_or_equal(difftype a, difftype b) const {
+    std::pair<difftype,difftype> rep = dcsampler.pair(a, b);
+    return dcranks[rep.first] <= dcranks[rep.second];
+  }
+
+  //----------------------------------------------------------------------
+  // compute_lcps_bounded
+  // 
+  // Compute all the lcp values in a subrange bounded by
+  // actual pivot suffixes.
+  //----------------------------------------------------------------------
+  void
+  compute_lcps_bounded(SuffixIterator left_suf, SuffixIterator right_suf,
+                     lcpiterator left_lcp, 
+                     lcptype minlcp, lcptype maxlcp)
+  {
+    if (left_suf == right_suf) return;
+    difftype midpoint = (right_suf-left_suf)/2;
+    //std::cout << "  midpoint of [0," << right_suf-left_suf 
+    //          << ")=" << midpoint << "\n";
+    
+    SuffixIterator mid_suf = left_suf + midpoint;
+    lcpiterator mid_lcp = left_lcp + midpoint;
+    
+    lcptype llcp_left = suffix_lcp(str+*(left_suf-1), str+*mid_suf, 
+                                  str_end, minlcp, maxlcp);
+    lcptype llcp_right = suffix_lcp(str+*mid_suf, str+*right_suf, 
+                                   str_end, minlcp, maxlcp);
+    *mid_lcp = llcp_right - llcp_left;
+    //std::cout << *mid_lcp << std::endl;
+
+    compute_lcps_bounded(left_suf, mid_suf, left_lcp, llcp_left, maxlcp);
+    compute_lcps_bounded(mid_suf+1, right_suf, mid_lcp+1, llcp_right, maxlcp);
+  }
+  
+  
+  //----------------------------------------------------------------------
+  // compute_lcps
+  //
+  // Compute all the lcp values in the full range.
+  //----------------------------------------------------------------------
+  void
+  compute_lcps()
+  {
+    if (pivots == pivots_end) return;
+    difftype centerpoint = (pivots_end - pivots)/2;
+    //std::cout << "midpoint of [0," << (pivots_end - pivots) << ")="
+    //      << centerpoint << "\n";
+    lcps[centerpoint] = 0;
+    
+    // unbounded ranges on the left
+    difftype rightpoint = centerpoint;
+    while (rightpoint > 0) {
+      difftype midpoint = rightpoint/2;
+      //std::cout << "midpoint of [0," << rightpoint << ")=" << midpoint << "\n";
+      lcptype llcp = suffix_lcp(str+pivots[midpoint], 
+                              str+pivots[rightpoint], 
+                              str_end, static_cast<lcptype>(0), maxlcp);
+      lcps[midpoint] = llcp;
+      //std::cout << llcp << std::endl;
+      compute_lcps_bounded(pivots+midpoint+1, pivots+rightpoint, 
+                          lcps.begin()+midpoint+1, llcp, maxlcp);
+      rightpoint = midpoint;
+    }
+    
+    // unbounded ranges on the right
+    SuffixIterator left_suf = pivots + centerpoint + 1;
+    lcpiterator left_lcp = lcps.begin() + centerpoint + 1;
+    while (left_suf != pivots_end) {
+      difftype midpoint = (pivots_end - left_suf)/2;
+      //std::cout << "midpoint of [" << (left_suf-pivots) << "," << 
+      //  (pivots_end-pivots) << ")=" << (left_suf-pivots)+midpoint << "\n";
+      lcptype llcp = suffix_lcp(str+left_suf[-1], str+left_suf[midpoint], 
+                              str_end, static_cast<lcptype>(0), maxlcp);
+      left_lcp[midpoint] = -llcp;
+      //std::cout << -llcp << std::endl;
+      compute_lcps_bounded(left_suf, left_suf+midpoint, left_lcp, 
+                          llcp, maxlcp);
+      left_suf += midpoint + 1;
+      left_lcp += midpoint + 1;
+    }
+    
+  }
+  
+
+
+#ifndef BINSEARCH
+  
+//----------------------------------------------------------------------
+// binary_search
+//----------------------------------------------------------------------
+difftype
+//SuffixIterator
+binary_search(StringIterator query) const
+{
+  //SuffixIterator left = pivots;
+  //SuffixIterator right = pivots_end;
+  //SuffixIterator mid;
+
+  difftype left = 0;
+  difftype right = pivots_end-pivots;
+  difftype mid;
+
+  lcptype minlcp = 0;
+  lcptype lcp_diff = 0;
+  lcptype lcp_left, lcp_right;
+
+ in_the_middle:
+  while (true) {
+    /*
+    std::cout << "middle minlcp=" << int(minlcp)
+             << " lcp_diff=" << int(lcp_diff)
+             << " mid=" << left + (right-left)/2
+             << "\n";
+    */
+    if (left == right) return left;
+    mid = left + (right-left)/2;
+    if (str+pivots[mid] == query) return mid+1;
+    lcptype lcp_mid = lcps[mid];
+    if (lcp_mid == 0) {
+      lcptype newlcp = suffix_lcp(query, str+pivots[mid], str_end, 
+                                 minlcp, maxlcp);
+      if (newlcp == maxlcp) {
+       lcp_left = lcp_right = minlcp;
+       goto fullmatch;
+      }
+      if (query+newlcp == str_end ||
+         query[newlcp] < str[pivots[mid]+newlcp]) {
+       right = mid;
+       lcp_diff = newlcp - minlcp;
+       if (lcp_diff != 0) goto closer_to_right;
+      } else {
+       left = mid+1;
+       lcp_diff = minlcp - newlcp;
+       if (lcp_diff != 0) goto closer_to_left;
+      }
+    } else {
+      if (lcp_mid < lcp_diff) {
+       left = mid+1;
+      } else {
+       right = mid;
+      }
+    }
+  }
+
+ closer_to_left:
+  while (true) {
+    /*
+    std::cout << "left minlcp=" << int(minlcp)
+             << " lcp_diff=" << int(lcp_diff)
+             << " mid=" << left + (right-left)/2
+             << "\n";
+    */
+    if (left == right) return left;
+    mid = left + (right-left)/2;
+    if (str+pivots[mid] == query) return mid+1;
+    lcptype lcp_mid = lcps[mid];
+    if (lcp_mid > lcp_diff) {
+      if (lcp_mid < 0) {
+       lcp_diff -= lcp_mid;
+       minlcp -= lcp_mid;
+      }
+      right = mid;
+    } else {
+      if (lcp_mid == lcp_diff) {
+       lcptype lcp = minlcp - lcp_diff;
+       lcptype newlcp = suffix_lcp(query, str+pivots[mid], str_end, 
+                                   lcp, maxlcp);
+       if (newlcp == maxlcp) {
+         lcp_left = lcp;
+         lcp_right = minlcp;
+         goto fullmatch;
+       }
+       if (query+newlcp == str_end ||
+           query[newlcp] < str[pivots[mid]+newlcp]) {
+         right = mid;
+         minlcp = lcp;
+         lcp_diff = newlcp - minlcp;
+         if (lcp_diff == 0) goto in_the_middle;
+         else goto closer_to_right;
+       } else {
+         left = mid+1;
+         lcp_diff = minlcp - newlcp;
+       }
+      } else {
+       left = mid+1;
+      }
+    }
+  }
+
+ closer_to_right:
+  while (true) {
+    /*
+    std::cout << "right minlcp=" << int(minlcp)
+             << " lcp_diff=" << int(lcp_diff)
+             << " mid=" << left + (right-left)/2
+             << "\n";
+    */
+    if (left == right) return left;
+    mid = left + (right-left)/2;
+    if (str+pivots[mid] == query) return mid+1;
+    lcptype lcp_mid = lcps[mid];
+    if (lcp_mid < lcp_diff) {
+      if (lcp_mid > 0) {
+       lcp_diff -= lcp_mid;
+       minlcp += lcp_mid;
+      }
+      left = mid+1;
+    } else {
+      if (lcp_mid == lcp_diff) {
+       lcptype lcp = minlcp + lcp_diff;
+       lcptype newlcp = suffix_lcp(query, str+pivots[mid], str_end, 
+                                   lcp, maxlcp);
+       if (newlcp == maxlcp) {
+         lcp_left = minlcp;
+         lcp_right = lcp;
+         goto fullmatch;
+       }
+       if (query+newlcp == str_end ||
+           query[newlcp] < str[pivots[mid]+newlcp]) {
+         right = mid;
+         lcp_diff = newlcp - minlcp;
+       } else {
+         left = mid+1;
+         minlcp = lcp;
+         lcp_diff = minlcp - newlcp;
+         if (lcp_diff == 0) goto in_the_middle;
+         else goto closer_to_left;
+       }
+      }
+      right = mid;
+    }
+  }
+
+ fullmatch:
+  // find the range of fully matching pivots
+  //std::cout << "fullmatch: left\n";
+  {
+    difftype right_tmp = mid;
+    while (left != right_tmp) {
+      difftype mid = left + (right_tmp-left)/2;
+      //std::cout << left << mid << right_tmp << std::endl;
+      difftype lcp_diff = lcps[mid];
+      //std::cout << " " << lcp_left << " " << lcp_diff << std::endl;
+      if (lcp_left + lcp_diff == maxlcp) {
+       right_tmp = mid;
+      } else {
+       if (lcp_diff > 0) { lcp_left += lcp_diff; }
+       left = mid+1;
+      }
+    }
+  }
+  //std::cout << "fullmatch: right\n";
+  {
+    difftype left_tmp = mid+1;
+    while (left_tmp != right) {
+      difftype mid = left_tmp + (right-left_tmp)/2;
+      //std::cout << left_tmp << mid << right << std::endl;
+      difftype lcp_diff = lcps[mid];
+      //std::cout << " " << lcp_right << " " << lcp_diff << std::endl;
+      if (lcp_right - lcp_diff == maxlcp) {
+        left_tmp = mid+1;
+      } else {
+       if (lcp_diff < 0) { lcp_right -= lcp_diff; }
+       right = mid;
+      }
+    }
+  }
+  //std::cout << "fullmatch: dcsearch\n";
+  while (left != right) {
+    difftype mid = left + (right-left)/2;
+    //std::cout << left << mid << right << std::endl;
+    if (dc_less_or_equal(pivots[mid], query-str)) {
+      left = mid+1;
+    } else {
+      right = mid;
+    }
+  }
+  return left;
+  
+}
+
+#endif
+
+};
+
+
+#endif // DISTRIBUTE_HPP
diff --git a/dcover/doubling.hpp b/dcover/doubling.hpp

new file mode 100644 (file)

index 0000000..3f8b29e
--- /dev/null
+++ b/dcover/doubling.hpp
@@ -0,0 +1,204 @@
+/*
+ * doubling.hpp
+ *
+ * Copyright 2003 Max-Planck-Institut fuer Informatik
+ *
+ */
+
+//======================================================================
+// Doubling algorithm for suffix array construction.
+// Follows closely the algorithm described in 
+//
+//   N.J. Larsson and K. Sadakane. Faster Suffix Sorting.
+//   Technical Report LU-CS-TR:99-214, Dept. of Computer Science,
+//   Lund University, Sweden, 1999.
+//======================================================================
+
+#ifndef DOUBLING_HPP
+#define DOUBLING_HPP
+
+#include "partition.hpp"
+
+#include <iterator>
+#include <algorithm>
+#include <utility>
+#include <sstream>
+#include <stdexcept>
+
+//----------------------------------------------------------------------
+// functor isa_access
+//
+// Given a suffix array iterator computes the corresponding
+// inverse suffix array entry to be used in comparisons.
+//----------------------------------------------------------------------
+template <typename ISAIterator, typename SAIterator>
+class isa_access
+  : public std::unary_function<SAIterator,
+          typename std::iterator_traits<ISAIterator>::value_type>
+{
+private:
+  typedef typename std::iterator_traits<ISAIterator>::difference_type
+          difference_type;
+  const ISAIterator isa;
+public:
+  isa_access(ISAIterator s, difference_type lcp) : isa(s+lcp) {}
+  typename std::iterator_traits<ISAIterator>::value_type
+  operator() (SAIterator i) const {
+    return isa[*i]; 
+  }
+};
+
+//----------------------------------------------------------------------
+// function update_group
+//
+// Update a group after doubling.
+// WARNING: cannot handle an empty group
+//----------------------------------------------------------------------
+template <typename SAIterator, typename ISAIterator>
+void update_group(SAIterator beg, SAIterator end,
+                 SAIterator sa, ISAIterator isa)
+{
+  typedef typename std::iterator_traits<SAIterator>::value_type
+    pos_type;
+  pos_type groupnumber = std::distance(sa, end) - 1;
+  if (end-beg > 1) {
+    for (SAIterator i = beg; i != end; ++i) {
+      isa[*i] = groupnumber;
+    }
+  } else {
+    isa[*beg] = groupnumber;
+    *beg = -1;
+  }
+}
+
+
+//----------------------------------------------------------------------
+// function selection_partition
+//
+// Performs doubling partitioning for small groups.
+//----------------------------------------------------------------------
+template <typename SAIterator, typename ISAIterator>
+void selection_partition(SAIterator sa, ISAIterator isa,
+                        SAIterator beg, SAIterator end, 
+         typename std::iterator_traits<SAIterator>::difference_type lcp)
+{
+  typedef typename std::iterator_traits<SAIterator>::value_type
+    pos_type;
+  isa_access<ISAIterator, SAIterator> key(isa, lcp);
+  while (end-beg > 1) {
+    SAIterator min_end = beg+1;
+    pos_type min_key = key(beg);
+    for (SAIterator i = beg+1; i != end; ++i) {
+      if (key(i) < min_key) {
+       std::swap(*beg, *i);
+       min_end = beg+1;
+       min_key = key(beg);
+      } else if (!(min_key < key(i))) {
+       std::swap(*min_end, *i);
+       ++min_end;
+      }
+    }
+    update_group(beg, min_end, sa, isa);
+    beg = min_end;
+  }
+  if (beg != end) update_group(beg, end, sa, isa);
+}
+
+
+//----------------------------------------------------------------------
+// function doubling_partition
+//
+// Given a group of suffixes with a common prefix of length lcp
+// partition it to subgroups according to the next lcp characters.
+// The next lcp characters are represented by a single value
+// in the inverse suffix array.
+//----------------------------------------------------------------------
+template <typename SAIterator, typename ISAIterator>
+void doubling_partition(SAIterator sa, ISAIterator isa,
+                       SAIterator beg, SAIterator end, 
+          typename std::iterator_traits<SAIterator>::difference_type lcp)
+{
+  typedef typename std::iterator_traits<SAIterator>::value_type
+    pos_type;
+
+  while (end-beg > 10) {
+
+    // ternary partition
+    isa_access<ISAIterator, SAIterator> get_isa_entry(isa, lcp);
+    SAIterator pivot = random_pivot(beg, end, std::less<pos_type>(), 
+                                   get_isa_entry);
+    std::pair<SAIterator, SAIterator> midrange
+      = ternary_partition(beg, end, pivot, std::less<pos_type>(), 
+                         get_isa_entry);
+    
+    // recurse on <-part
+    if (beg != midrange.first) {
+      doubling_partition(sa, isa, beg, midrange.first, lcp);
+    }
+    
+    // update =-part
+    update_group(midrange.first, midrange.second, sa, isa);
+    
+    // loop on >-part
+    beg = midrange.second;
+  }
+
+  selection_partition(sa, isa, beg, end, lcp);
+}
+
+
+
+//======================================================================
+// function doubling_sort
+//
+// The main doubling algorithm
+//======================================================================
+template <typename SAIterator, typename ISAIterator>
+void
+doubling_sort(SAIterator sa, SAIterator sa_end,
+             ISAIterator isa, ISAIterator isa_end,
+         typename std::iterator_traits<ISAIterator>::difference_type lcp)
+{
+  typedef typename std::iterator_traits<SAIterator>::difference_type 
+    difference_type;
+
+  difference_type size = std::distance(sa, sa_end);
+  if (isa_end-isa != size) {
+    std::ostringstream os;
+    os << "doubling_sort:"
+       << " wrong range size";
+    throw std::logic_error(os.str());
+  }
+
+  while (*sa > -size) {
+
+    SAIterator beg = sa;
+    difference_type nrunlen = 0;
+    while (beg != sa_end) {
+      difference_type val = *beg;
+      if (val < 0) {
+       // jump over already sorted part
+       beg -= val;
+       nrunlen += val;
+      } else {
+       // combine preceding sorted parts
+       if (nrunlen < 0) {
+         *(beg+nrunlen) = nrunlen;
+         nrunlen = 0;
+       }
+       // partition unsorted group
+       SAIterator end = sa + isa[val] + 1;
+       doubling_partition(sa, isa, beg, end, lcp);
+       beg = end;
+      }
+    }
+    if (nrunlen < 0) {
+      *(beg+nrunlen) = nrunlen;
+    }
+
+    lcp *= 2;
+  }
+}
+
+
+#endif // DOUBLING_HPP
diff --git a/dcover/kmpsplit.hpp b/dcover/kmpsplit.hpp

new file mode 100644 (file)

index 0000000..77e7d99
--- /dev/null
+++ b/dcover/kmpsplit.hpp
@@ -0,0 +1,196 @@
+
+#include <vector>
+#include <iostream>
+#include <iterator>
+
+template <class Iterator, class DCSampler, class RankIterator>
+class kmp_split {
+public:
+
+  kmp_split(Iterator s, Iterator se, Iterator p, 
+           const DCSampler& d, RankIterator r)
+    : str(s), str_end(se), suf(str), pivot(p), pivot_end(p+d.period()-1), 
+      last(1), last_lcp(0), dcsampler(d), dcranks(r)
+  { 
+    preprocess(); 
+  }
+
+private:
+
+  bool dc_less(Iterator a, Iterator b) const {
+    std::pair<int,int> rep = dcsampler.pair(a-str, b-str);
+    return dcranks[rep.first] <= dcranks[rep.second];
+  }
+
+public:
+
+  bool is_next_less() {
+    // state:
+    // suf == str_end ||
+    // last_lcp == lcp((suf-last), pivot) &&
+    // last_less == (suf-last)+last_lcp == str_end || 
+    //              (suf-last)[last_lcp] < pivot[last_lcp]
+
+    bool result = true;
+    if (suf==str_end) return true;
+    if (suf==pivot) {
+      last_lcp = pivot_end-pivot;
+      last_less = false;
+      ++suf; last=1;
+      return false;
+    }
+
+    /*
+    std::cout << " last=" << last
+             << " last_lcp=" << last_lcp
+             << " last+lcp[last]=" << last+lcp[last]
+             << " last_less=" << last_less
+             << " less[last]=" << bool(less[last]);
+    */
+
+    if (last_lcp < last || last_lcp == last+lcp[last]) {
+      int lcp;
+      if (last_lcp < last) lcp = 0;
+      else lcp = last_lcp - last;
+      while (suf+lcp != str_end && pivot+lcp != pivot_end
+            && suf[lcp]==pivot[lcp]) {
+       ++lcp;
+      }
+      /*
+      std::cout << " lcp=" << lcp
+               << " suf+lcp-str=" << suf+lcp-str
+               << " str_end-str=" << str_end-str
+               << " suf+lcp==str=" << (suf+lcp==str);
+      */
+      result = (suf+lcp==str_end) 
+       || (pivot+lcp==pivot_end ? dc_less(suf, pivot) : suf[lcp] < pivot[lcp]);
+      //std::cout << " result=" << result << std::endl;
+      last_lcp = lcp;
+      last_less = result;
+      ++suf; last = 1;
+    } else if (last_lcp > last+lcp[last]) {
+      result = !less[last];
+      ++suf; ++last;
+    } else if (last_lcp < last+lcp[last]) {
+      result = last_less;
+      last_lcp -= last;
+      ++suf; last = 1;
+    }
+
+    //std::cout << " result=" << result << std::endl;
+    return result;
+  }
+
+  //81   0      212 cabcbacacabcbacacabcbacacabcbacacabcbaca
+
+private:
+  Iterator str;
+  Iterator str_end;
+  Iterator suf;
+  Iterator pivot;
+  Iterator pivot_end;
+
+  int last;
+  int last_lcp;
+  bool last_less;
+
+  std::vector<int> lcp;
+  std::vector<char> less;
+
+  const DCSampler& dcsampler;
+  RankIterator dcranks;
+
+  void preprocess() {
+
+    int length = pivot_end-pivot;
+
+    int i = 0;
+    int j = 1;
+
+    /*
+    std::vector<int> shift(length+1);
+    shift[0] = -1;
+
+    while (true) {
+      while ( j < length && pivot[i] == pivot[j] ) {
+       shift[j] = shift[i];
+       ++i; ++j;
+      }
+      shift[j] = i;
+      if (j == length) break;
+      do {
+       i = shift[i];
+      } while ( i != -1 && pivot[i] != pivot[j] );
+      if (i==-1) { ++i; ++j; }
+    }
+
+    std::copy (shift.begin(), shift.end(),
+              std::ostream_iterator<int>(std::cout, " "));
+    std::cout << std::endl;
+    */
+
+    lcp.resize(length+1);
+    less.resize(length+1);
+
+    lcp[0] = length;
+    less[0] = 0;
+    i = 0; j = 1;
+
+    while (j <= length) {
+      while ( j < length && pivot[i] == pivot[j] ) {
+       ++i; ++j;
+      }
+      if ( j == length || pivot[i] < pivot[j] ) {
+       lcp[j-i] = i;
+       less[j-i] = 1;
+      } else {
+       lcp[j-i] = i;
+       less[j-i] = 0;
+      }
+      int k = i;
+      do {
+       --i;
+       if (i == -1) {
+         ++i; ++j;
+         break;
+       } else if (lcp[k-i] == i) {
+         break;
+       } else if (lcp[k-i] < i) {
+         lcp[j-i] = lcp[k-i];
+         less[j-i] = less[k-i];
+       } else {
+         lcp[j-i] = i; // i == lcp[j-k]
+         less[j-i] = less[j-k];
+       }
+      } while (true);
+    }
+
+    /*
+    std::copy (lcp.begin(), lcp.end(),
+              std::ostream_iterator<int>(std::cout, " "));
+    std::cout << std::endl;
+    std::copy (less.begin(), less.end(),
+              std::ostream_iterator<int>(std::cout, " "));
+    std::cout << std::endl;
+    */  
+  }
+
+
+};
+
+
+/*
+
+gagcgag
+gagcgag 7 0
+ g      0 0
+  ga    1 1
+   g    0 0
+    gag 3 1
+     g  0 0
+      g 1 1
+        0 1
+
+aagagcgagta
+             
+*/
diff --git a/dcover/partition.hpp b/dcover/partition.hpp

new file mode 100644 (file)

index 0000000..6b72c94
--- /dev/null
+++ b/dcover/partition.hpp
@@ -0,0 +1,435 @@
+/*
+ * partition.hpp
+ *
+ * Copyright 2003 Max-Planck-Institut fuer Informatik
+ *
+ */
+
+//======================================================================
+// A collection of flexible, generic, STL-like algorithm components 
+// for comparison-based distribution (i.e., quicksort-like) sorting.
+//======================================================================
+
+#ifndef PARTITION_HPP
+#define PARTITION_HPP
+
+#include <algorithm>
+#include <utility>
+#include <functional>
+#include <iterator>
+#include <cstdlib>   // for rand()
+
+//======================================================================
+// Most of the algorithms here have three versions. For example a sort
+// function can have the following forms:
+//   sort(beg, end);
+//   sort(beg, end, less);
+//   sort(beg, end, less, key);
+// The first two forms are standard STL. The third form adds an
+// argument key, which is a unary function.  Where the second
+// form would compare two iterators i and j as less(*i, *j),
+// the third form does less(key(i), key(j)) instead. This allows
+// moving some of the computation related to the comparison from
+// less() into key().  When the same iterator is used in many
+// comparisons (for example the pivot), key() needs to be evaluated
+// only once. 
+// 
+// For example, to sort objects of type T with a member function
+// size() returning int into increasing order of size() do
+// (TODO: test/expand this, use vector of vectors)
+//
+//   sort(beg, end, std::less<unsigned>(), std::mem_fun(&T::size));
+//
+// key() may return a pointer or reference to (a member of) the object.
+// All the algorithms assume that the value returned by key() stays
+// valid only as long as the object is not moved.
+//
+// TYPE REQUIREMENTS
+// For example:
+// template <typename Iterator, typename Compare, typename KeyAccess>
+// void sort(Iterator, Iterator, Compare, KeyAccess);
+//  * KeyAccess is a model of Adaptable Unary Function
+//  * Iterator is convertible to KeyAccess::argument_type
+//  * KeyAccess::result_type is convertible to the argument type
+//    of Compare.
+//======================================================================
+
+//----------------------------------------------------------------------
+// functor copy_dereference
+// functor const_dereference
+// 
+// Default functors to be used as key().
+// (TODO: which one should be used as the default?)
+// (TODO: is const_dereference::result_type a problem?)
+//----------------------------------------------------------------------
+template <typename Iterator>
+class const_dereference
+  : public std::unary_function<Iterator, 
+          const typename std::iterator_traits<Iterator>::value_type &>
+{
+public:
+  const typename std::iterator_traits<Iterator>::value_type &
+  operator() (Iterator i) const { return *i; }
+};
+
+template <typename Iterator>
+class copy_dereference 
+  : public std::unary_function<Iterator, 
+                typename std::iterator_traits<Iterator>::value_type>
+{
+public:
+  typename std::iterator_traits<Iterator>::value_type
+  operator() (Iterator i) const { return *i; }
+};
+
+template <typename Iterator>
+class default_key : public copy_dereference<Iterator> {};
+
+
+
+//======================================================================
+//
+// Pivot selection
+//
+// (TODO: non-randomized pivot selectors)
+//======================================================================
+
+//----------------------------------------------------------------------
+// function iter_median3
+//
+// Median of three
+//----------------------------------------------------------------------
+template <typename Iterator, typename Compare, typename KeyAccess>
+Iterator
+iter_median3(Iterator a, Iterator b, Iterator c, 
+            Compare less, KeyAccess key) {
+  typedef typename KeyAccess::result_type key_type;
+  key_type ka = key(a), kb = key(b), kc = key(c);
+  if (less(ka, kb))
+    if (less(kb, kc)) return b;
+    else if (less(ka, kc)) return c;
+    else return a;
+  else if (less(ka, kc)) return a;
+  else if (less(kb, kc)) return c;
+  else return b;
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <typename Iterator, typename Compare>
+Iterator
+iter_median3(Iterator a, Iterator b, Iterator c, Compare less) {
+  return iter_median3(a, b, c, less, default_key<Iterator>());
+}
+
+template <typename Iterator>
+Iterator
+iter_median3(Iterator a, Iterator b, Iterator c) {
+  typedef typename std::iterator_traits<Iterator>::value_type key_type;
+  return iter_median3(a, b, c, std::less<key_type>(), 
+                     default_key<Iterator>());
+}
+
+
+//----------------------------------------------------------------------
+// function random_pivot
+//
+// Choose pivot of as a median of three random elements
+// (or as a single random element for small ranges)
+//
+// (TODO: better/faster random number generation?)
+//----------------------------------------------------------------------
+template <typename RandomAccessIterator, typename Compare, 
+         typename KeyAccess>
+RandomAccessIterator
+random_pivot (RandomAccessIterator beg, RandomAccessIterator end,
+             Compare less, KeyAccess key)
+{
+  static double scale_to_one = 1.0L/(RAND_MAX+1.0);
+
+  typedef typename std::iterator_traits<RandomAccessIterator>::difference_type 
+    difference_type;
+  difference_type size = std::distance(beg, end);
+  double scale_to_size = size * scale_to_one;
+  RandomAccessIterator pivot;
+  pivot = beg + static_cast<difference_type>(scale_to_size*std::rand());
+  if (size > 50) {
+    RandomAccessIterator pivot2, pivot3;
+    pivot2 = beg + static_cast<difference_type>(scale_to_size*std::rand());
+    pivot3 = beg + static_cast<difference_type>(scale_to_size*std::rand());
+    pivot = iter_median3(pivot, pivot2, pivot3, less, key);
+  }
+  return pivot;
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <typename RandomAccessIterator, typename Compare>
+RandomAccessIterator
+random_pivot(RandomAccessIterator beg, RandomAccessIterator end,
+                   Compare less)
+{
+  return random_pivot(beg, end, less,
+                     default_key<RandomAccessIterator>());
+}
+
+template <typename RandomAccessIterator>
+RandomAccessIterator
+random_pivot(RandomAccessIterator beg, RandomAccessIterator end)
+{
+  typedef typename std::iterator_traits<RandomAccessIterator>::value_type
+    key_type;
+  return random_pivot(beg, end, std::less<key_type>(),
+                     default_key<RandomAccessIterator>());
+}
+
+
+
+
+//======================================================================
+// Partitioning
+//======================================================================
+
+//----------------------------------------------------------------------
+// function binary_partition
+//
+// Partitions the input range [beg,end) into two parts [beg,mid) and
+// [mid,end) with all elements smaller than pivot in [beg, mid) and
+// all elements larger than pivot in [mid,end). Equal elements may
+// be in either subrange. Returns mid.
+// PRECONDITION: pivot is in the range [beg,end)
+//----------------------------------------------------------------------
+template <typename RandomAccessIterator, typename Compare, 
+         typename KeyAccess>
+RandomAccessIterator
+binary_partition(RandomAccessIterator beg, RandomAccessIterator end,
+                RandomAccessIterator pivot, Compare less, KeyAccess key)
+{
+  if (beg == --end) return beg; 
+
+  // move the pivot to a safe place and setup sentinels
+  std::swap(*beg, *pivot);
+  if (less(key(end), key(beg))) {
+    std::swap(*beg, *end);
+    pivot = end;
+  } else {
+    pivot = beg;
+  }
+
+  // now the pivot won't move anymore
+  typename KeyAccess::result_type pivot_key = key(pivot);
+
+  while (true) {
+    do { ++beg; } while (less(key(beg), pivot_key));
+    do { --end; } while (less(pivot_key, key(end)));
+    if (!(beg < end)) return beg;
+    std::swap(*beg, *end);
+  }
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <typename RandomAccessIterator, typename Compare>
+RandomAccessIterator
+binary_partition(RandomAccessIterator beg, RandomAccessIterator end,
+                RandomAccessIterator pivot, Compare less)
+{
+  return binary_partition(beg, end, pivot, less, 
+                         default_key<RandomAccessIterator>());
+}
+
+template <typename RandomAccessIterator>
+RandomAccessIterator
+binary_partition(RandomAccessIterator beg, RandomAccessIterator end,
+                RandomAccessIterator pivot)
+{
+  typedef typename std::iterator_traits<RandomAccessIterator>::value_type
+    key_type;
+  return binary_partition(beg, end, pivot, std::less<key_type>(), 
+                         default_key<RandomAccessIterator>());
+}
+
+
+//----------------------------------------------------------------------
+// function ternary_partition
+//
+// Partitions the input range [beg,end) into three parts according to
+// comparisons to the pivot:  | less | equal | greater |
+// Returns the middle (equal) range.
+// PRECONDITION: pivot is in the range [beg,end)
+//
+// METHOD
+// The algorithm maintains the following invariant:
+//
+// | equal  |  less  |  unknown  | greater | equal |
+//  ^        ^        ^         ^         ^         ^
+// beg  first_less   left     right  last_greater  end
+//
+//----------------------------------------------------------------------
+template <typename RandomAccessIterator, typename Compare,
+         typename KeyAccess>
+std::pair<RandomAccessIterator,RandomAccessIterator>
+ternary_partition(RandomAccessIterator beg, RandomAccessIterator end,
+                 RandomAccessIterator pivot, Compare less,
+                 KeyAccess key)
+{
+  RandomAccessIterator left = beg, right = end-1;
+  RandomAccessIterator first_less = left, last_greater = right;
+
+  // first move pivot to a safe place and setup sentinels
+  std::iter_swap(pivot, left);
+  if (less(key(right), key(left))) {
+    std::iter_swap(left, right);
+    pivot = right;
+    ++left; --right; --last_greater;
+  } else {
+    pivot = left;
+    ++left; ++first_less;
+  }
+
+  // now the pivot won't move anymore
+  typename KeyAccess::result_type pivot_key = key(pivot);
+
+  while(true) {
+    while(less(key(left), pivot_key)) ++left;
+    while (!less(pivot_key, key(left)) && left < right) {
+      std::iter_swap(left++, first_less++);
+      while(less(key(left), pivot_key)) ++left;
+    }
+    while(less(pivot_key, key(right))) --right;
+    while (!less(key(right), pivot_key) && left < right) {
+      std::iter_swap(right--, last_greater--);
+      while(less(pivot_key, key(right))) --right;
+    }
+    if (!(left < right)) break;
+    std::iter_swap(left++, right--);
+  }
+
+  // now the situation is this:
+  //
+  // | equal  |     less     |    greater   | equal |
+  //  ^        ^            ^ ^            ^         ^
+  // beg  first_less    right left    last_greater  end
+  //
+  // or this:
+  //
+  // | equal  |     less    |=|   greater   | equal |
+  //  ^        ^             ^             ^         ^
+  // beg  first_less       left       last_greater  end
+  //                       right
+  //
+  // next: swap equal parts to the middle
+
+  typename std::iterator_traits<RandomAccessIterator>::difference_type size;
+  size = std::min(first_less-beg, left-first_less);
+  std::swap_ranges(beg, beg+size, left-size);
+  size = std::min(end-1-last_greater, last_greater-right);
+  std::swap_ranges(right+1, right+1+size, end-size);
+  return std::make_pair(beg+(left-first_less), end-(last_greater-right));
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <typename RandomAccessIterator, typename Compare>
+std::pair<RandomAccessIterator,RandomAccessIterator>
+ternary_partition(RandomAccessIterator beg, RandomAccessIterator end,
+                 RandomAccessIterator pivot, Compare less)
+{
+  return ternary_partition(beg, end, pivot, less,
+                          default_key<RandomAccessIterator>());
+}
+
+template <typename RandomAccessIterator>
+std::pair<RandomAccessIterator,RandomAccessIterator>
+ternary_partition(RandomAccessIterator beg, RandomAccessIterator end,
+                 RandomAccessIterator pivot)
+{
+  typedef typename std::iterator_traits<RandomAccessIterator>::value_type
+    key_type;
+  return ternary_partition(beg, end, pivot, std::less<key_type>(),
+                          default_key<RandomAccessIterator>());
+}
+
+
+//======================================================================
+// Sorting
+//======================================================================
+
+//----------------------------------------------------------------------
+// function insertion_sort
+//
+//----------------------------------------------------------------------
+template <typename RandomAccessIterator, typename Compare, 
+         typename KeyAccess>
+void
+insertion_sort(RandomAccessIterator beg, RandomAccessIterator end, 
+              Compare less, KeyAccess key)
+{
+  if (end-beg <= 1) return;
+  for (RandomAccessIterator i = beg+1; i != end; ++i) {
+    typename KeyAccess::result_type i_key = key(i); // i is not moved ...
+    RandomAccessIterator j = i-1;
+    if (!less(i_key, key(j))) continue;
+    for ( ; j != beg; --j) {
+      if (!less(i_key, key(j-1))) break;
+      std::swap(*(j-1), *j);
+    }
+    std::swap(*i, *j);                              // ... until here
+  }
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <typename RandomAccessIterator, typename Compare>
+void
+insertion_sort(RandomAccessIterator beg, RandomAccessIterator end, 
+              Compare less)
+{
+  insertion_sort(beg, end, less, default_key<RandomAccessIterator>());
+}
+
+template <typename RandomAccessIterator>
+void
+insertion_sort(RandomAccessIterator beg, RandomAccessIterator end)
+{
+  typedef typename std::iterator_traits<RandomAccessIterator>::value_type
+    key_type;
+  insertion_sort(beg, end, std::less<key_type>(), 
+                default_key<RandomAccessIterator>());
+}
+
+
+//----------------------------------------------------------------------
+// function quicksort
+//
+//----------------------------------------------------------------------
+template <typename RandomAccessIterator, typename Compare, 
+         typename KeyAccess>
+void
+quicksort(RandomAccessIterator beg, RandomAccessIterator end, 
+         Compare less, KeyAccess key)
+{
+  while (end-beg > 15) {
+    RandomAccessIterator pivot = random_pivot(beg, end, less, key);
+    RandomAccessIterator mid = binary_partition(beg, end, pivot, less, key);
+    quicksort(beg, mid, less, key);
+    beg = mid;
+  }
+  insertion_sort(beg, end, less, key);
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+
+template <typename RandomAccessIterator, typename Compare>
+void
+quicksort(RandomAccessIterator beg, RandomAccessIterator end, Compare less)
+{
+  quicksort(beg, end, less, default_key<RandomAccessIterator>());
+}
+
+template <typename RandomAccessIterator>
+void
+quicksort(RandomAccessIterator beg, RandomAccessIterator end)
+{
+  typedef typename std::iterator_traits<RandomAccessIterator>::value_type
+    key_type;
+  quicksort(beg, end, std::less<key_type>(), 
+           default_key<RandomAccessIterator>());
+}
+
+
+#endif // PARTITION_HPP
diff --git a/dcover/stringsort.hpp b/dcover/stringsort.hpp

new file mode 100644 (file)

index 0000000..faef78f
--- /dev/null
+++ b/dcover/stringsort.hpp
@@ -0,0 +1,337 @@
+/* 
+ * stringsort.hpp
+ *
+ * Copyright 2003 Max-Planck-Institut fuer Informatik 
+ *
+ */
+
+//======================================================================
+// Multikey quicksort for suffixes
+//======================================================================
+
+#ifndef STRINGSORT_HPP
+#define STRINGSORT_HPP
+
+#include "partition.hpp"
+
+#include <algorithm>
+#include <iterator>
+#include <functional>
+#include <vector>
+
+
+//----------------------------------------------------------------------
+// function string_compare
+//
+// Compare two strings until a mismatch but no longer than limit 
+// characters.  The sign of the result indicates the order of the 
+// strings and the absolute value the distance of the mismatch from 
+// the limit. Zero means that no mismatch was found within limit 
+// characters.
+//
+// WARNING: A check for the end of a string is user's responsibility.
+//----------------------------------------------------------------------
+template <typename StringIterator, typename DifferenceType,
+         typename CharCompare>
+DifferenceType
+string_compare(StringIterator s1, StringIterator s2, 
+              DifferenceType limit, CharCompare less) {
+  for ( ; limit > 0; --limit, ++s1, ++s2) {
+    if (less(*s1,*s2)) return -limit;
+    if (less(*s2,*s1)) return limit;
+  }
+  return 0;
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+template <typename StringIterator, typename DifferenceType>
+inline DifferenceType
+string_compare(StringIterator s1, StringIterator s2, DifferenceType limit) 
+{
+  typedef typename std::iterator_traits<StringIterator>::value_type
+    char_type;
+  return string_compare(s1, s2, limit, std::less<char_type>());
+}
+
+
+//----------------------------------------------------------------------
+// const int suffix_insertion_sort_threshold_
+//
+// Groups of suffixes with size less or equal to this value are 
+// sorted by insertion sort.
+//----------------------------------------------------------------------
+static const int suffix_insertion_sort_threshold_ = 15;
+
+//----------------------------------------------------------------------
+// function suffix_insertion_sort
+// 
+// Sort suffixes using insertion sort.
+// A difference from a straightforward insertion sort is that
+// the lengths of longest common prefixes of the already sorted 
+// suffixes are stored and used in later insertions.
+//----------------------------------------------------------------------
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler, typename CharCompare>
+void
+suffix_insertion_sort(StringIterator str, StringIterator str_end,
+                     SArrayIterator sa, SArrayIterator sa_end,
+     typename std::iterator_traits<StringIterator>::difference_type limit,
+                     GroupHandler handle_unsorted_group,
+                     CharCompare less)
+{
+  typedef typename std::iterator_traits<StringIterator>::difference_type 
+    str_diff_type;
+  typedef typename std::iterator_traits<SArrayIterator>::difference_type 
+    sa_diff_type;
+  typedef typename std::iterator_traits<SArrayIterator>::value_type 
+    pos_type;
+  
+  static std::vector<str_diff_type> lcp(suffix_insertion_sort_threshold_);
+  lcp.resize(sa_end-sa);
+  lcp.back() = -1;
+  
+  for (sa_diff_type i = sa_end-sa-2; i >= 0; --i) {
+    pos_type i_pos = sa[i];
+    StringIterator i_str = str + i_pos;
+    StringIterator j_str = str + sa[i+1];
+    str_diff_type i_limit = std::min(limit, str_end - i_str);
+    str_diff_type ij_limit = std::min(i_limit, str_end - j_str);
+    str_diff_type dist = string_compare(i_str, j_str, ij_limit, less);
+    if (dist < 0 || (dist == 0 && i_limit == ij_limit)) {
+      lcp[i] = ij_limit + dist;
+    } else {
+      sa[i] = sa[i+1];
+      str_diff_type i_lcp = ij_limit - dist;
+
+      sa_diff_type j = i+1;
+      for ( ; lcp[j] >= i_lcp; ++j) {
+       if (lcp[j] == i_lcp) {
+         j_str = str+sa[j+1];
+         ij_limit = std::min(i_limit, str_end-j_str);
+         dist = string_compare(i_str+i_lcp, j_str+i_lcp, 
+                               ij_limit-i_lcp, less);
+         if (dist < 0 || (dist == 0 && i_limit == ij_limit)) {
+           lcp[j] = ij_limit + dist;
+           break;
+         } else {
+           i_lcp = ij_limit - dist;
+         }
+       }
+       sa[j] = sa[j+1];
+       lcp[j-1] = lcp[j];
+      }
+
+      sa[j] = i_pos;
+      lcp[j-1] = i_lcp;
+    }
+  }
+
+  // find and process unsorted groups
+  typename std::vector<pos_type>::iterator beg, end, last = lcp.end()-1;
+  for (beg = lcp.begin(); beg < last; ++beg) {
+    if (*beg == limit) {
+      end = beg;
+      do { ++end; } while (*end == limit);
+      handle_unsorted_group(sa+(beg-lcp.begin()), sa+(end-lcp.begin())+1);
+      beg = end;
+    }
+  }
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler>
+void
+suffix_insertion_sort(StringIterator str, StringIterator str_end,
+                     SArrayIterator sa, SArrayIterator sa_end,
+    typename std::iterator_traits<StringIterator>::difference_type limit,
+                     GroupHandler handle_unsorted_group)
+{
+  typedef typename std::iterator_traits<StringIterator>::value_type
+    char_type;
+  suffix_insertion_sort(str, str_end, sa, sa_end, limit,
+                       handle_unsorted_group, std::less<char_type>());
+}
+
+
+//----------------------------------------------------------------------
+// functor char_access
+//
+// Compute the character that represents a suffix in comparisons.
+//----------------------------------------------------------------------
+template <typename StringIterator, typename SArrayIterator>
+class char_access
+  : public std::unary_function<SArrayIterator,
+          typename std::iterator_traits<StringIterator>::value_type>
+{
+private:
+  const StringIterator str;
+public:
+  char_access(StringIterator s) : str(s) {}
+
+  typename std::iterator_traits<StringIterator>::value_type
+  operator() (SArrayIterator i) const {
+    return str[*i]; 
+  }
+};
+
+
+//======================================================================
+// function suffix_mkqsort
+// function suffix_mkqsort_noempty
+//
+// Sort suffixes using multikey quicksort.
+// These two functions call each other to implement multikey quicksort.
+// Either can be used as the initial call. suffix_mkqsort_noempty
+// places more restrictions on the input but can be slightly faster.
+//======================================================================
+
+//----------------------------------------------------------------------
+// function suffix_mkqsort
+//
+// Sort suffixes using multikey quicksort.
+// This is the more general form.
+//----------------------------------------------------------------------
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler, typename CharCompare>
+void
+suffix_mkqsort(StringIterator str, StringIterator str_end,
+              SArrayIterator sa, SArrayIterator sa_end,
+    typename std::iterator_traits<StringIterator>::difference_type limit,
+              GroupHandler handle_unsorted_group,
+              CharCompare less)
+{                
+  typedef typename std::iterator_traits<SArrayIterator>::value_type pos_type;
+
+  while (sa_end-sa > suffix_insertion_sort_threshold_ && limit > 0) {
+
+    // check for empty suffix(es)
+    pos_type length = static_cast<pos_type>(str_end-str);
+    sa = std::partition(sa, sa_end, 
+                       std::bind2nd(std::equal_to<pos_type>(), length));
+
+    // this would check only for one empty suffix
+    //SArrayIterator empty = std::find(sa, sa_end, length);
+    //if (empty != sa_end) {
+    //  std::swap(*sa, *empty);
+    //  ++sa;
+    //}
+
+    // partition
+    char_access<StringIterator, SArrayIterator> key(str);
+    SArrayIterator pivot = random_pivot(sa, sa_end, less, key);
+    std::pair<SArrayIterator,SArrayIterator> midrange;
+    midrange = ternary_partition(sa, sa_end, pivot, less, key);
+
+    // recurse on <-part
+    if (midrange.first - sa > 1) {
+      suffix_mkqsort_noempty(str, str_end, sa, midrange.first, 
+                            limit, handle_unsorted_group, less);
+    }
+
+    // recurse on >-part
+    if (sa_end - midrange.second > 1) {
+      suffix_mkqsort_noempty(str, str_end, midrange.second, sa_end, 
+                            limit, handle_unsorted_group, less);
+    }
+
+    // loop on =-part
+    sa = midrange.first;
+    sa_end = midrange.second;
+
+    // move to the next position in the suffixes
+    ++str;
+    --limit;
+
+  }
+
+  if (sa_end-sa > 1) {
+    if (limit == 0) {
+      handle_unsorted_group(sa, sa_end);
+    } else {
+      suffix_insertion_sort(str, str_end, sa, sa_end, limit, 
+                           handle_unsorted_group, less);
+    }
+  }
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler>
+void
+suffix_mkqsort(StringIterator str, StringIterator str_end,
+              SArrayIterator sa, SArrayIterator sa_end,
+     typename std::iterator_traits<StringIterator>::difference_type limit,
+              GroupHandler handle_unsorted_group)
+{
+  typedef typename std::iterator_traits<StringIterator>::value_type
+    char_type;
+  suffix_mkqsort(str, str_end, sa, sa_end, limit,
+                handle_unsorted_group, std::less<char_type>());
+}
+
+
+//----------------------------------------------------------------------
+// function suffix_mkqsort_noempty
+//
+// Sort suffixes using multikey quicksort.
+// Used as a subroutine of suffix_mkqsort but can also be called 
+// directly.  Places more restrictions on input (see below) but
+// can be slightly faster than suffix_mkqsort.
+//
+// The additional restrictions:
+// PRECONDITION: limit>0 and there are no empty suffixes
+//----------------------------------------------------------------------
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler, typename CharCompare>
+void
+suffix_mkqsort_noempty(StringIterator str, StringIterator str_end,
+                      SArrayIterator sa, SArrayIterator sa_end,
+    typename std::iterator_traits<StringIterator>::difference_type limit,
+                      GroupHandler handle_unsorted_group,
+                      CharCompare less)
+{                
+  typedef typename std::iterator_traits<SArrayIterator>::value_type pos_type;
+
+  while (sa_end-sa > suffix_insertion_sort_threshold_) {
+
+    // partition
+    char_access<StringIterator, SArrayIterator> key(str);
+    SArrayIterator pivot = random_pivot(sa, sa_end, less, key);
+    std::pair<SArrayIterator,SArrayIterator> midrange;
+    midrange = ternary_partition(sa, sa_end, pivot, less, key);
+
+    // recurse on <-part
+    if (midrange.first - sa > 1) {
+      suffix_mkqsort_noempty(str, str_end, sa, midrange.first, 
+                            limit, handle_unsorted_group, less);
+    }
+
+    // recurse on =-part
+    if (midrange.second - midrange.first > 1) {
+      suffix_mkqsort(str+1, str_end, midrange.first, midrange.second, 
+                    limit-1, handle_unsorted_group, less);
+    }
+
+    // loop on >-part
+    sa = midrange.second;
+  }
+
+  if (sa_end-sa > 1) {
+    suffix_insertion_sort(str, str_end, sa, sa_end, limit, 
+                         handle_unsorted_group, less);
+  }
+}
+// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
+template <typename StringIterator, typename SArrayIterator,
+         typename GroupHandler>
+void
+suffix_mkqsort_noempty(StringIterator str, StringIterator str_end,
+                      SArrayIterator sa, SArrayIterator sa_end,
+      typename std::iterator_traits<StringIterator>::difference_type limit,
+                      GroupHandler handle_unsorted_group)
+{
+  typedef typename std::iterator_traits<StringIterator>::value_type
+    char_type;
+  suffix_mkqsort_noempty(str, str_end, sa, sa_end, limit,
+                        handle_unsorted_group, std::less<char_type>());
+}
+    
+#endif // STRINGSORT_HPP
author	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Fri, 6 Mar 2009 13:39:13 +0000 (13:39 +0000)
committer	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Fri, 6 Mar 2009 13:39:13 +0000 (13:39 +0000)
dcover/bwt.hpp	[new file with mode: 0644]	patch \| blob
dcover/dcsamplesort.hpp	[new file with mode: 0644]	patch \| blob
dcover/dcsuffixsort.hpp	[new file with mode: 0644]	patch \| blob
dcover/difference_cover.cpp	[new file with mode: 0644]	patch \| blob
dcover/difference_cover.hpp	[new file with mode: 0644]	patch \| blob
dcover/distribute.hpp	[new file with mode: 0644]	patch \| blob
dcover/doubling.hpp	[new file with mode: 0644]	patch \| blob
dcover/kmpsplit.hpp	[new file with mode: 0644]	patch \| blob
dcover/partition.hpp	[new file with mode: 0644]	patch \| blob
dcover/stringsort.hpp	[new file with mode: 0644]	patch \| blob