From: nvalimak Date: Fri, 6 Mar 2009 13:39:13 +0000 (+0000) Subject: BWT via Difference cover X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;h=368c3edc500ee74859732705e46b7c346e8a6d65;hp=7eaf8bbcb18ab4a481905d219ab0b9e8286c75b2;p=SXSI%2FTextCollection.git BWT via Difference cover git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@209 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- diff --git a/dcover/bwt.hpp b/dcover/bwt.hpp new file mode 100644 index 0000000..b717226 --- /dev/null +++ b/dcover/bwt.hpp @@ -0,0 +1,456 @@ +/* + * bwt.hpp + * + * Copyright 2004 Juha Ka"rkka"inen + * + */ + +#include "dcsuffixsort.hpp" +#include "dcsamplesort.hpp" +#include "stringsort.hpp" +#include "difference_cover.hpp" +#include "distribute.hpp" +#include "kmpsplit.hpp" +/* +#include "doubling.hpp" +*/ + +#include +#include +#include +#include +#include +#include +/* +#include +#include +*/ + +#ifndef DCOVER +#define DCOVER 8 +#endif + +#ifndef RANDSEED +#define RANDSEED 46 +#endif + +#define MIN_PIVOTS 3 +#define OVERSAMPLING_RATE 10.0 + + +struct do_nothing { + template + void operator() (Iterator, Iterator) {} +}; + + + + +template +typename std::iterator_traits::difference_type +compute_bwt (StringIterator str, StringIterator str_end, + OutputIterator bwt, unsigned endmarkers, unsigned int dcover = DCOVER) +{ + + typedef typename std::iterator_traits::difference_type + difftype; // signed integral type + typedef unsigned long sizetype; // unsigned integral type + typedef difftype postype; // type for representing vector positions + + typedef typename std::vector::iterator pos_iterator; + +#if DEBUG > 0 + std::cout << "BWT construction starts" << std::endl; +#endif + + //----------------- + // setup and check + //----------------- + + sizetype length = std::distance(str, str_end); + +#if DEBUG > 1 + typedef typename std::iterator_traits::value_type + chartype; + typedef typename std::iterator_traits::value_type + bwttype; + std::cout << " text memory=" << length*sizeof(chartype) << "\n" + << " bwt memory=" << (length+1)*sizeof(chartype) << std::endl; +#endif + + dc_sampler dcsampler(length+1, dcover); + +#if DEBUG > 1 + std::cout << " dcsampler memory=" + << (dcsampler.packed_period() + 2*dcsampler.period()) + * sizeof(unsigned) + << std::endl; +#endif + + sizetype blocksize = dcsampler.samplesize(); + sizetype npivots = MIN_PIVOTS; + + // make average bucket size <= blocksize/OVERSAMPLING_RATE + while ((OVERSAMPLING_RATE*length)/(npivots+1.0) > blocksize) ++npivots; + +#if DEBUG > 1 + std::cout << " length=" << length + << " samplesize=" << dcsampler.samplesize() + << " blocksize=" << blocksize + << " npivots=" << npivots + << std::endl; +#endif + + if (dcsampler.period() + 2*npivots > length) { + // too small text for sampling + +#if DEBUG > 0 + std::cout << "small text: sort all suffixes\n"; +#endif + + std::vector sa(length); + for (pos_iterator i=sa.begin(); i!=sa.end(); ++i) { + *i = i - sa.begin(); + } + suffix_mkqsort(str, str_end, sa.begin(), sa.end(), length, do_nothing()); + difftype eof_position = 0; + for (pos_iterator i=sa.begin(); i!=sa.end(); ++i) { + if (*i != 0) { + if (str[*i-1] <= endmarkers) + *bwt++ = 0; + else + *bwt++ = str[*i-1] - endmarkers; + } else { + if (str[*(i-1)-1] <= endmarkers) + *bwt++ = 0; + else + *bwt++ = str[*(i-1)-1] - endmarkers; + eof_position = i - sa.begin() +1; // FIXME + } + } + return eof_position; + } + + // don't take too short suffixes as pivots + unsigned int pivotrange = length-dcsampler.period()+1; + + //----------------------------------- + // setup difference cover sample + //----------------------------------- + +#if DEBUG > 0 + std::cout << "difference cover construction ... " << std::endl; +#endif + + sizetype samplesize = dcsampler.samplesize(); + std::vector dcranks(samplesize); + std::vector suffixes(samplesize); + +#if DEBUG > 1 + std::cout << "dcranks memory=" + << samplesize * sizeof(difftype) + << "\n"; + std::cout << "suffixes memory=" + << samplesize * sizeof(difftype) + << std::endl; +#endif + + sort_dc_sample(dcsampler, str, str_end, + dcranks.begin(), dcranks.end(), + suffixes.begin(), suffixes.end()); + +#if DEBUG > 3 + std::cout << "sample ranks:\n"; + std::copy(dcranks.begin(), dcranks.end(), + std::ostream_iterator(std::cout,"\n")); + std::cout << "end of sample ranks" << std::endl; +#endif + + + + //------------------- + // setup pivots + //------------------- + +#if DEBUG > 0 + std::cout << "setting up pivots ..." << std::endl; +#endif + + std::set pivotset; + double scale_factor = (pivotrange * 1.0L)/(RAND_MAX+1.0L); + std::srand(RANDSEED); + while (pivotset.size() < npivots) { + int r = std::rand(); + int newpivot = static_cast(scale_factor*r); + pivotset.insert(newpivot); + } + +#if DEBUG > 1 + std::cout << " pivotset memory = " << pivotset.size() + << " elements" << std::endl; +#endif + + std::vector pivots(pivotset.begin(), pivotset.end()); + pivotset.clear(); + + // sort the pivots + dc_sort_suffixes(str, str_end, pivots.begin(), pivots.end(), + dcsampler, dcranks.begin(), dcranks.end()); + +#if DEBUG > 2 + std::cout << "pivots sorted lexicographically:\n"; + for (pos_iterator i = pivots.begin(); i != pivots.end(); ++i) { + std::cout << std::setw(8) << *i << " "; + std::copy(str+*i, std::min(str_end, str+*i+40), + std::ostream_iterator(std::cout,"")); + std::cout << std::endl; + } +#endif + + +#if DEBUG > 0 + std::cout << " constructing distributor ... " << std::endl; +#endif + + suffix_distributor, pos_iterator> + distributor(str, str_end, pivots.begin(), pivots.end(), + dcsampler, dcranks.begin()); + +#if DEBUG > 1 + distributor.print(); +#endif + + + + //------------------------ + // compute bucketsizes + //------------------------ + +#if DEBUG > 0 + std::cout << "computing bucket sizes ... " << std::endl; +#endif + + std::vector buckets(distributor.nbuckets()); + +#if DEBUG > 1 + std::cout << " number of buckets: " << distributor.nbuckets() << "\n"; + std::cout << " bucket vector memory=" + << buckets.size() * sizeof(difftype) + << std::endl; +#endif + + for (StringIterator suf = str; suf != str_end; ++suf) { + unsigned int bucket = distributor.find_bucket(suf); + ++buckets[bucket]; + +#if DEBUG > 3 + std::cout << " into bucket " << bucket; + std::cout << ":" << std::setw(7) << suf-str << " "; + std::copy(suf, std::min(str_end, suf+40), + std::ostream_iterator(std::cout,"")); + std::cout << std::endl; +#endif + + } + // ++buckets[0]; // empty suffix + +#if DEBUG > 1 + std::cout << " bucket sizes: "; + std::copy(buckets.begin(), buckets.end(), + std::ostream_iterator(std::cout," ")); + std::cout << std::endl; +#endif + + + //----------------------- + // build and sort blocks + //----------------------- + +#if DEBUG > 0 + std::cout << "Computing and sorting blocks ... " << std::endl; +#endif + +// *bwt++ = *(str_end-1); //FIXME no empty suffix? + + postype eof_position = 0; + sizetype total_sum = 1; + + postype bucket_begin = 0; + while (bucket_begin < (postype)buckets.size()) { + + sizetype sum = buckets[bucket_begin]; + postype bucket_end = bucket_begin + 1; + while (bucket_end < (postype)buckets.size() + && sum+buckets[bucket_end] <= blocksize) { + sum += buckets[bucket_end]; + ++bucket_end; + } + + postype left_pivot = bucket_begin-1; + postype right_pivot = bucket_end-1; + + short lcp = 0; + if (left_pivot >= 0 && right_pivot < (postype)pivots.size()) { +#if __WORDSIZE == 32 + lcp = suffix_lcp(str+pivots[left_pivot], str+pivots[right_pivot], + str_end, 0, dcsampler.period()-1); +#else + lcp = suffix_lcp(str+pivots[left_pivot], str+pivots[right_pivot], + str_end, 0l, dcsampler.period()-1); +#endif + } + +#if DEBUG > 1 + std::cout << "block [" << bucket_begin << "," << bucket_end << ")" + << " size=" << sum + << " lcp=" << lcp + << std::endl; +#endif + + if (sum > blocksize) { + blocksize = sum; + suffixes.resize(blocksize); + +#if DEBUG > 1 + std::cout << " blocksize increased to " + << blocksize << "\n"; + std::cout << " suffixes memory resized to " + << blocksize*sizeof(postype) + << std::endl; +#endif + + } + + pos_iterator i = suffixes.begin(); + //if (bucket_begin == 0) { *i++ = length; // empty suffix } + + postype left_pivot_not_beyond = left_pivot<0 ? 0 : left_pivot; + kmp_split, pos_iterator> + left_cmp(str, str_end, str+pivots[left_pivot_not_beyond], + dcsampler, dcranks.begin()); + + postype right_pivot_not_beyond = + right_pivot==(postype)pivots.size() ? right_pivot-1 : right_pivot; + kmp_split, pos_iterator> + right_cmp(str, str_end, str+pivots[right_pivot_not_beyond], + dcsampler, dcranks.begin()); + + for (StringIterator suf = str; suf != str_end; ++suf) { + +#if DEBUG > 3 + std::cout << ":" << std::setw(3) << suf-str << " "; + std::copy(suf, std::min(str_end, suf+40), + std::ostream_iterator(std::cout,"")); + std::cout << std::endl; +#endif + + /* + // is the suffix left of the block + if (left_pivot >=0) { + postype pivotpos = pivots[left_pivot]; + if (*suf < str[pivotpos]) continue; + int lcp = suffix_lcp(suf, str+pivotpos, + str_end, 0, dcsampler.period()-1); + if (lcp < dcsampler.period()-1) { + if (suf+lcp == str_end || suf[lcp] < (str+pivotpos)[lcp]) { + continue; + } + } else { + postype sufpos = suf-str; + std::pair result + = dcsampler.pair(sufpos, pivotpos); + if (dcranks[result.first] < dcranks[result.second]){ + continue; + } + } + } + + // is the suffix right of the block + if (right_pivot < (postype)pivots.size()) { + postype pivotpos = pivots[right_pivot]; + if (*suf > str[pivotpos]) continue; + int lcp = suffix_lcp(str+pivotpos, suf, + str_end, 0, dcsampler.period()-1); + if (lcp < dcsampler.period()-1) { + if (suf+lcp < str_end && (str+pivotpos)[lcp] < suf[lcp]) { + continue; + } + } else { + postype sufpos = suf-str; + std::pair result + = dcsampler.pair(pivotpos, sufpos); + if (dcranks[result.first] <= dcranks[result.second]){ + continue; + } + } + } + */ + + //postype bucket = distributor.find_bucket(suf); + + bool to_left = left_pivot>0 && left_cmp.is_next_less(); + bool to_right = right_pivot<(postype)pivots.size() && !right_cmp.is_next_less() ; + + if (to_left || to_right) { + //assert (bucket_begin > bucket || bucket >= bucket_end); + continue; + } + + // it is in the bucket + //postype bucket = distributor.find_bucket(suf); + + //if (bucket_begin > bucket || bucket >= bucket_end) continue; + + //assert (bucket_begin <= bucket && bucket < bucket_end); + + *i++ = suf-str; +#if DEBUG > 3 + //std::cout << "into bucket " << bucket; + std::cout << ":" << std::setw(3) << suf-str << " "; + std::copy(suf, std::min(str_end, suf+40), + std::ostream_iterator(std::cout,"")); + std::cout << std::endl; +#endif + + } + + assert(i == suffixes.begin()+sum); + + dc_sort_suffixes(str, str_end, suffixes.begin(), i, + dcsampler, dcranks.begin(), dcranks.end()); + +#if DEBUG > 3 + std::cout << "sorted bucket:\n"; + for (pos_iterator j = suffixes.begin(); j != i; ++j) { + std::cout << std::setw(3) << *j << " "; + std::copy(str+*j, std::min(str_end, str+*j+40), + std::ostream_iterator(std::cout,"")); + std::cout << std::endl; + } +#endif + + for (pos_iterator j=suffixes.begin(); j != i; ++j) { + if (*j != 0) { + if (str[*j-1] <= endmarkers) + *bwt++ = 0; + else + *bwt++ = str[*j-1] - endmarkers; + } else { + if (str[*(j-1)-1] <= endmarkers) + *bwt++ = 0; + else + *bwt++ = str[*(j-1)-1] - endmarkers; + eof_position = total_sum + (j - suffixes.begin()); + } + } + + total_sum += sum; + + bucket_begin = bucket_end; + } + + assert(total_sum == length+1); + + return eof_position; +} diff --git a/dcover/dcsamplesort.hpp b/dcover/dcsamplesort.hpp new file mode 100644 index 0000000..507cfad --- /dev/null +++ b/dcover/dcsamplesort.hpp @@ -0,0 +1,110 @@ +/* + * dcsamplesort.hpp + * + * Copyright 2003 Max-Planck-Institut fuer Informatik + * Copyright 2004 Juha Ka"rkka"inen + * + */ + +#ifndef DCSAMPLESORT_HPP +#define DCSAMPLESORT_HPP + +#include "difference_cover.hpp" +#include "doubling.hpp" +#include "stringsort.hpp" +/* +*/ + +#include +#include +/* +#include +#include +*/ + + +//---------------------------------------------------------------------- +// functor mark_unsorted_group +// +// When string sorting leaves a group of suffixes unsorted, +// it marks that group by calling this functor. +// +// The marking is done by one's complementing the first and last +// entries of the group. Complemented values are assumed to become +// negative. Complementing is used instead of negation to be able +// mark the zero. +//---------------------------------------------------------------------- +class mark_unsorted_group { +public: + template + void operator() (SArrayIterator beg, SArrayIterator end) const { + *beg = ~*beg; + --end; + *end = ~*end; // note: group of size one stays unmarked + } +}; + + + + +template +void +sort_dc_sample(Sampler sampler, + RandomAccessIterator1 str, RandomAccessIterator1 str_end, + RandomAccessIterator2 ranks, RandomAccessIterator2 ranks_end, + RandomAccessIterator3 suffixes, + RandomAccessIterator3 suffixes_end) +{ + int size = sampler.samplesize(); + if ((ranks_end-ranks != size) || (suffixes_end-suffixes != size)) { + std::ostringstream os; + os << "sort_dc_sample:" + << " wrong range size"; + throw std::logic_error(os.str()); + } + + // collect the sample + RandomAccessIterator3 beyond = sampler.fill(suffixes); + if (suffixes_end != beyond) { + std::ostringstream os; + os << "sort_dc_sample:" + << " sample.fill produced " << beyond-suffixes + << " samples but samplesize is " << size; + throw std::logic_error(os.str()); + } + + // sort the sample to limited depth + suffix_mkqsort(str, str_end, suffixes, suffixes_end, sampler.period(), + mark_unsorted_group()); + + // compute ranks + RandomAccessIterator3 first = suffixes; + while (first != suffixes_end) { + if (*first >= 0) { + RandomAccessIterator3 last; + ranks[sampler.pack(*first)] = first - suffixes; + for (last=first+1; last != suffixes_end && *last >= 0; ++last) { + ranks[sampler.pack(*last)] = last - suffixes; + } + *first = -(last-first); + first = last; + } else { + RandomAccessIterator3 last; + *first = ~*first; + for (last=first+1; *last >= 0; ++last) ; + *last = ~*last; + for ( ; first <= last; ++first) { + *first = sampler.pack(*first); + ranks[*first] = last - suffixes; + } + } + } + + // complete sorting with the doubling algorithm + doubling_sort(suffixes, suffixes_end, ranks, ranks_end, + sampler.packed_period()); + +} + +#endif // DCSAMPLESORT_HPP diff --git a/dcover/dcsuffixsort.hpp b/dcover/dcsuffixsort.hpp new file mode 100644 index 0000000..b1dcfbc --- /dev/null +++ b/dcover/dcsuffixsort.hpp @@ -0,0 +1,131 @@ +/* + * dcsuffixsort.hpp + * + * Copyright 2003 Max-Planck-Institut fuer Informatik + * Copyright 2004 Juha Ka"rkka"inen + * + */ + +#ifndef DCSUFFIXSORT_HPP +#define DCSUFFIXSORT_HPP + +#include "stringsort.hpp" +#include "difference_cover.hpp" +#include "partition.hpp" +/* +#include "doubling.hpp" +#include "stringsort.hpp" +#include "checker.hpp" +#include "timing.hpp" +*/ + +#include +#include +#include +#include +/* +#include +#include +#include +#include +*/ + +//---------------------------------------------------------------------- +// dc_suffix_compare +// +// compare two suffixes using difference covers +//---------------------------------------------------------------------- +template +class dc_suffix_compare { +private: + typedef typename std::iterator_traits::value_type + rank_type; + const RankIterator sample_ranks; + const dc_sampler& sample; +public: + dc_suffix_compare(RankIterator beg, const dc_sampler& s) + : sample_ranks(beg), sample(s) {} + template + bool operator() (StringPosType a, StringPosType b) const { + std::pair rep = sample.pair(a, b); + return sample_ranks[rep.first] < sample_ranks[rep.second]; + } +}; + + +//---------------------------------------------------------------------- +// functor mark_unsorted_group +// +// When string sorting leaves a group of suffixes unsorted, +// it marks that group by calling this functor. +// +// The marking is done by one's complementing the first and last +// entries of the group. Complemented values are assumed to become +// negative. Complementing is used instead of negation to be able +// mark the zero. +//---------------------------------------------------------------------- +class mark_unsorted_group2 { +public: + template + void operator() (SArrayIterator beg, SArrayIterator end) const { + *beg = ~*beg; + --end; + *end = ~*end; // note: group of size one stays unmarked + } +}; + +/* +//---------------------------------------------------------------------- +// functor do_nothing +// +// Replaces mark_unsorted_group when nothing needs to be done +// to unsorted groups (this happens when doing stringsort only) +//---------------------------------------------------------------------- +class do_nothing { +public: + template + void operator() (Iterator, Iterator) {} +}; +*/ + + +//---------------------------------------------------------------------- +// function dc_sort_suffixes +// +// Sort a set of suffixes using a difference cover sample. +//---------------------------------------------------------------------- +template +void +dc_sort_suffixes(StringIterator str, StringIterator str_end, + SArrayIterator sa, SArrayIterator sa_end, + const DCSampler& sampler, + RankIterator ranks, RankIterator ranks_end) +{ + if (ranks_end-ranks != sampler.samplesize()) { + std::ostringstream os; + os << "dc_suffix_sort:" + << " wrong range size"; + throw std::logic_error(os.str()); + } + + suffix_mkqsort(str, str_end, sa, sa_end, sampler.period(), + mark_unsorted_group2()); + + SArrayIterator first = sa; + while (true) { + while (first != sa_end && *first >= 0) ++first; + if (first == sa_end) break; + *first = ~*first; + SArrayIterator last = first + 1; + while (*last >= 0) ++last; + *last = ~*last; + ++last; + quicksort(first, last, + dc_suffix_compare(ranks, sampler)); + first = last; + } + +} + +#endif // DCSUFFIXSORT_HPP diff --git a/dcover/difference_cover.cpp b/dcover/difference_cover.cpp new file mode 100644 index 0000000..6b6744c --- /dev/null +++ b/dcover/difference_cover.cpp @@ -0,0 +1,125 @@ +/* + * differerence_cover.cpp + * + * Copyright 2003 Max-Planck-institut fuer Informatik + * + */ + +//====================================================================== +// This file implements the computation of difference covers +//====================================================================== + +#include +#include +#include +#include + +#include "difference_cover.hpp" + + +//---------------------------------------------------------------------- +// Precomputed difference covers. +// +// The covers from 0 to 6 are from +// Luk, Wong: Two new quorum base algorithms for distributed mutual +// exclusion. In Proc. 17th International Conference on Distributed +// Computing Systems, pp. 100-106. IEEE, 1997. +// Covers 7 and 8 were found using an exhaustive search algorithm. +// All except cover 8 are known to be optimal. +//---------------------------------------------------------------------- +namespace { + const unsigned max_precomputed_cover = 8; + const unsigned coversizes[max_precomputed_cover+1] = + {1,2,3,4,5,7,9,13,20}; + const unsigned cover0[] = {0}; + const unsigned cover1[] = {0,1}; + const unsigned cover2[] = {0,1,2}; + const unsigned cover3[] = {0,1,2,4}; + const unsigned cover4[] = {0,1,2,5,8}; + const unsigned cover5[] = {0,1,2,3,7,11,19}; //{0,7,8,10,14,19,23}; + const unsigned cover6[] = {0,1,2,5,14,16,34,42,59}; + const unsigned cover7[] = {0,1,3,7,17,40,55,64,75,85,104,109,117}; + const unsigned cover8[] = {0,1,3,7,12,20,30,44,65,80,89,96,114,122, + 128,150,196,197,201,219}; + const unsigned * covers[] = { cover0, cover1, cover2, cover3, cover4, + cover5, cover6, cover7, cover8 }; +} + + +//---------------------------------------------------------------------- +// constructor of difference_cover +// +// Constructs a difference cover modulo a power of two. +// (see difference_cover.hpp) +// +// Small difference covers have been precomputed (see above). +// Larger difference covers are computed by the algorithm in +// C.J. Colbourn and A.C.H. Ling. Quorums from Difference Covers. +// Information Processing Letters 75(1-2):9-12, 2000 +//---------------------------------------------------------------------- +difference_cover::difference_cover(unsigned lv) + : logmod(lv), mask((1<::iterator i = cover.begin(); + *i = 0; ++i; + std::fill(i, i+r, 1); i+=r; + *i = r+1; ++i; + std::fill(i, i+r, 2*r+1); i+=r; + std::fill(i, i+2*r+1, 4*r+3); i+=2*r+1; + std::fill(i, i+r+1, 2*r+2); i+=r+1; + std::fill(i, i+r, 1); i+=r; + if (i != cover.end()) { + throw std::logic_error("error in the Coulbourn-Ling algorithm"); + } + std::partial_sum(cover.begin(), cover.end(), cover.begin()); + + } + +#if DEBUG > 1 + std::cerr << "difference cover modulo " << (1<(std::cerr, " ") ); + std::cerr << std::endl; +#endif + + // compute the lookup tables + unsigned i, j, modulus=1< 2 + std::cerr << i << " = " << ((diff2pos[i]+i)&mask); + std::cerr << " - " << diff2pos[i]; + std::cerr << " (mod " << modulus << ")"; + std::cerr << std::endl; +#endif + } +} diff --git a/dcover/difference_cover.hpp b/dcover/difference_cover.hpp new file mode 100644 index 0000000..0fe116e --- /dev/null +++ b/dcover/difference_cover.hpp @@ -0,0 +1,163 @@ +/* + * difference_cover.hpp + * + * Copyright 2003 Max-Planck-institut fuer Informatik + * + */ + +//====================================================================== +// This file defines two classes: +// +// class difference_cover implements basic difference covers +// modulo any power of two +// class dc_sampler implements the difference cover based sampling +// used in the suffix array construction +// +// Part of the implementation of class difference_cover is in +// difference_cover.cpp +//====================================================================== + +#ifndef DIFFERENCE_COVER_HPP +#define DIFFERENCE_COVER_HPP + +#include +#include +//#include + +//---------------------------------------------------------------------- +// class difference_cover +// +// A difference cover D modulo v is a set of integers in the range [0,v) +// such that for all i in [0,v), there exists j,k in D such that +// i = k-j (mod v). +// +// This class represents a difference cover modulo any power of two. +// The sizes of the difference covers for small v are: +// v 1 2 4 8 16 32 64 128 256 512 1024 ... +// |D| 1 2 3 4 5 7 9 13 20 28 40 ... +// For v<=128, the size of the difference cover is minimal. +// For any v, the size is less than sqrt(1.5*v)+6 (sqrt(v) is a lower bound). +// +// Notes: +// - All arithmetic is done with unsigned int, because it guarantees +// correct modular arithmetic including conversion to/from other +// integral types. This is a main reason why supporting values of v +// other than powers of two would be difficult. +//---------------------------------------------------------------------- +class difference_cover { + +public: + + // create a difference cover modulo 2^lv + explicit difference_cover(unsigned lv); // defined in difference_cover.cpp + + // basic dimensions + unsigned modulus() const { return 1< + IntType div(IntType i) const { return i>>logmod; } + + // mapping [0,modulus) <-> [0,size) + unsigned rank(unsigned pos) const { return coverrank[pos&mask]; } + unsigned pos(unsigned rank) const { return cover[rank]; } + + // compute k such that i+k and j+k are both in the cover + unsigned offset(unsigned i, unsigned j) const { + return (diff2pos[(j-i)&mask]-i)&mask; + } + + typedef std::vector::const_iterator iterator; + iterator begin() const { return cover.begin(); } + iterator end() const { return cover.end(); } + +private: + + unsigned logmod; // log v + unsigned mask; // v-1 = 11...1 in binary + unsigned coversize; // |D| + + std::vector cover; // D + std::vector coverrank; // rank of a value in D + std::vector diff2pos; // lookup table for finding a + // pair with a given difference +}; + + + +//---------------------------------------------------------------------- +// class dc_sampler +// +// Represents the periodic sample positions chosen according +// to a difference cover. +// +// The main duties are: +// 1. Fill the sparse suffix array with the sample positions. +// 2. Provide a mapping pack() from sample positions to packed positions. +// A packed position pack(i) of a sample position i is used as +// the corresponding position in the inverse sparse suffix array. +// pack(i) must satisfy: +// a) pack(i) is unique (different for each sample position i) and +// in the range [0,samplesize) +// b) The difference pack(i+v)-pack(i), where v is the length of +// the period (modulus of the difference cover), is the same for +// all i. This difference is called the packed period. +// 3. Find the representative pair for two arbitrary positions i and j. +// The representative pair is a pair of positions in the inverse +// sparse suffix array that can be used for comparing the suffixes +// i and j if they have a common prefix of length period-1. +// The representative pair is computed by finding k in [0,period) +// such that i+k and j+k are both sample positions and returning +// the packed positions corresponding to i+k and j+k. +//---------------------------------------------------------------------- +template +class dc_sampler { +public: + + dc_sampler(OffsetType s, OffsetType v) : dcover(v), fullsize_(s), + samplesize_(dcover.div(s)*dcover.size()+dcover.rank(s)) + {} + + OffsetType fullsize() const { return fullsize_; } + OffsetType samplesize() const { return samplesize_; } + OffsetType period() const { return dcover.modulus(); } + OffsetType packed_period() const { return dcover.size(); } + + OffsetType pack(OffsetType i) const { + return dcover.div(i) * dcover.size() + dcover.rank(i); + } + + // representative pair + std::pair + pair(OffsetType i, OffsetType j) const { + OffsetType k = dcover.offset(i, j); + return std::make_pair(pack(i+k), pack(j+k)); + } + + // fills a range with the sample positions + template + OutputIterator fill(OutputIterator it) const { + difference_cover::iterator j = dcover.begin(); + OffsetType i = 0; + OffsetType pos = i + static_cast(*j); + while (pos < fullsize()) { + *it++ = pos; + if (++j == dcover.end()) { + i += dcover.modulus(); + j = dcover.begin(); + } + pos = i + static_cast(*j); + } + return it; + } + +private: + + const difference_cover dcover; + const OffsetType fullsize_; // size of the range where samples are chosen + const OffsetType samplesize_; +}; + +#endif // DIFFERENCE_COVER_HPP diff --git a/dcover/distribute.hpp b/dcover/distribute.hpp new file mode 100644 index 0000000..04aea51 --- /dev/null +++ b/dcover/distribute.hpp @@ -0,0 +1,405 @@ +/* + * distribute.hpp + * + * Copyright 2004 Juha K"arkk"ainen + * + */ + +#ifndef DISTRIBUTE_HPP +#define DISTRIBUTE_HPP + +#include "dcsuffixsort.hpp" + +#include +#include +#include + + +//---------------------------------------------------------------------- +// suffix_lcp +// +// Compute the length of the longest common prefix between two +// suffixes a and b. +// str_end = end of the text +// minlcp = length of the already known common prefix +// maxlcp = stop when the length reaches maxlcp +//---------------------------------------------------------------------- +template +LcpType +suffix_lcp(StringIterator a, StringIterator b, StringIterator str_end, + LcpType minlcp, LcpType maxlcp) +{ + if (a < b) swap(a,b); // make a the shorter suffix + if ((str_end-a) > maxlcp) str_end = a + maxlcp; + std::pair mm = + std::mismatch(a+minlcp, str_end, b+minlcp); + //std::cout << " lcp=" << mm.first-a << "\n"; + return static_cast(mm.first-a); +} + + +//---------------------------------------------------------------------- +// class suffix_distributor +//---------------------------------------------------------------------- +template +class suffix_distributor +{ +private: + //typedef typename std::iterator_traits::difference_type + // lcptype; + typedef short lcptype; + typedef typename + std::iterator_traits::difference_type difftype; + + + StringIterator str, str_end; + SuffixIterator pivots, pivots_end; + std::vector lcps; + const DCSampler& dcsampler; + RankIterator dcranks; + lcptype maxlcp; + + typedef typename std::vector::iterator lcpiterator; + +public: + + suffix_distributor(StringIterator str_, StringIterator str_end_, + SuffixIterator pivots_, SuffixIterator pivots_end_, + const DCSampler& dcsampler_, RankIterator dcranks_) + : str(str_), str_end(str_end_), pivots(pivots_), pivots_end(pivots_end_), + lcps(pivots_end-pivots), dcsampler(dcsampler_), dcranks(dcranks_), + maxlcp(dcsampler.period()) + { + compute_lcps(); + } + + void reset(SuffixIterator pivots_, SuffixIterator pivots_end_) + { + pivots=pivots_; + pivots_end=pivots_end_; + lcps.resize(pivots_end-pivots); + + compute_lcps(); + } + + + difftype nbuckets() const{ return pivots_end-pivots+1; } + + difftype find_bucket(StringIterator suf) const { + return binary_search(suf); + //return i - pivots; + } + + void + print() const { +#if DEBUG > 1 + std::cout << "distributor memory=" + << lcps.size()*sizeof(lcptype) + << std::endl; +#if DEBUG > 2 + std::cout << "pivots and lcps:\n"; + for (SuffixIterator i = pivots; i != pivots_end; ++i) { + std::cout << std::setw(4) << i-pivots + << std::setw(4) << int(lcps[i-pivots]) + << std::setw(9) << *i << " "; + std::copy(str+*i, std::min(str_end, str+*i+40), + std::ostream_iterator(std::cout,"")); + std::cout << "\n"; + } + std::cout << "distributor end\n"; +#endif +#endif + } + + +private: + + bool dc_less_or_equal(difftype a, difftype b) const { + std::pair rep = dcsampler.pair(a, b); + return dcranks[rep.first] <= dcranks[rep.second]; + } + + //---------------------------------------------------------------------- + // compute_lcps_bounded + // + // Compute all the lcp values in a subrange bounded by + // actual pivot suffixes. + //---------------------------------------------------------------------- + void + compute_lcps_bounded(SuffixIterator left_suf, SuffixIterator right_suf, + lcpiterator left_lcp, + lcptype minlcp, lcptype maxlcp) + { + if (left_suf == right_suf) return; + difftype midpoint = (right_suf-left_suf)/2; + //std::cout << " midpoint of [0," << right_suf-left_suf + // << ")=" << midpoint << "\n"; + + SuffixIterator mid_suf = left_suf + midpoint; + lcpiterator mid_lcp = left_lcp + midpoint; + + lcptype llcp_left = suffix_lcp(str+*(left_suf-1), str+*mid_suf, + str_end, minlcp, maxlcp); + lcptype llcp_right = suffix_lcp(str+*mid_suf, str+*right_suf, + str_end, minlcp, maxlcp); + *mid_lcp = llcp_right - llcp_left; + //std::cout << *mid_lcp << std::endl; + + compute_lcps_bounded(left_suf, mid_suf, left_lcp, llcp_left, maxlcp); + compute_lcps_bounded(mid_suf+1, right_suf, mid_lcp+1, llcp_right, maxlcp); + } + + + //---------------------------------------------------------------------- + // compute_lcps + // + // Compute all the lcp values in the full range. + //---------------------------------------------------------------------- + void + compute_lcps() + { + if (pivots == pivots_end) return; + difftype centerpoint = (pivots_end - pivots)/2; + //std::cout << "midpoint of [0," << (pivots_end - pivots) << ")=" + // << centerpoint << "\n"; + lcps[centerpoint] = 0; + + // unbounded ranges on the left + difftype rightpoint = centerpoint; + while (rightpoint > 0) { + difftype midpoint = rightpoint/2; + //std::cout << "midpoint of [0," << rightpoint << ")=" << midpoint << "\n"; + lcptype llcp = suffix_lcp(str+pivots[midpoint], + str+pivots[rightpoint], + str_end, static_cast(0), maxlcp); + lcps[midpoint] = llcp; + //std::cout << llcp << std::endl; + compute_lcps_bounded(pivots+midpoint+1, pivots+rightpoint, + lcps.begin()+midpoint+1, llcp, maxlcp); + rightpoint = midpoint; + } + + // unbounded ranges on the right + SuffixIterator left_suf = pivots + centerpoint + 1; + lcpiterator left_lcp = lcps.begin() + centerpoint + 1; + while (left_suf != pivots_end) { + difftype midpoint = (pivots_end - left_suf)/2; + //std::cout << "midpoint of [" << (left_suf-pivots) << "," << + // (pivots_end-pivots) << ")=" << (left_suf-pivots)+midpoint << "\n"; + lcptype llcp = suffix_lcp(str+left_suf[-1], str+left_suf[midpoint], + str_end, static_cast(0), maxlcp); + left_lcp[midpoint] = -llcp; + //std::cout << -llcp << std::endl; + compute_lcps_bounded(left_suf, left_suf+midpoint, left_lcp, + llcp, maxlcp); + left_suf += midpoint + 1; + left_lcp += midpoint + 1; + } + + } + + + +#ifndef BINSEARCH + +//---------------------------------------------------------------------- +// binary_search +//---------------------------------------------------------------------- +difftype +//SuffixIterator +binary_search(StringIterator query) const +{ + //SuffixIterator left = pivots; + //SuffixIterator right = pivots_end; + //SuffixIterator mid; + + difftype left = 0; + difftype right = pivots_end-pivots; + difftype mid; + + lcptype minlcp = 0; + lcptype lcp_diff = 0; + lcptype lcp_left, lcp_right; + + in_the_middle: + while (true) { + /* + std::cout << "middle minlcp=" << int(minlcp) + << " lcp_diff=" << int(lcp_diff) + << " mid=" << left + (right-left)/2 + << "\n"; + */ + if (left == right) return left; + mid = left + (right-left)/2; + if (str+pivots[mid] == query) return mid+1; + lcptype lcp_mid = lcps[mid]; + if (lcp_mid == 0) { + lcptype newlcp = suffix_lcp(query, str+pivots[mid], str_end, + minlcp, maxlcp); + if (newlcp == maxlcp) { + lcp_left = lcp_right = minlcp; + goto fullmatch; + } + if (query+newlcp == str_end || + query[newlcp] < str[pivots[mid]+newlcp]) { + right = mid; + lcp_diff = newlcp - minlcp; + if (lcp_diff != 0) goto closer_to_right; + } else { + left = mid+1; + lcp_diff = minlcp - newlcp; + if (lcp_diff != 0) goto closer_to_left; + } + } else { + if (lcp_mid < lcp_diff) { + left = mid+1; + } else { + right = mid; + } + } + } + + closer_to_left: + while (true) { + /* + std::cout << "left minlcp=" << int(minlcp) + << " lcp_diff=" << int(lcp_diff) + << " mid=" << left + (right-left)/2 + << "\n"; + */ + if (left == right) return left; + mid = left + (right-left)/2; + if (str+pivots[mid] == query) return mid+1; + lcptype lcp_mid = lcps[mid]; + if (lcp_mid > lcp_diff) { + if (lcp_mid < 0) { + lcp_diff -= lcp_mid; + minlcp -= lcp_mid; + } + right = mid; + } else { + if (lcp_mid == lcp_diff) { + lcptype lcp = minlcp - lcp_diff; + lcptype newlcp = suffix_lcp(query, str+pivots[mid], str_end, + lcp, maxlcp); + if (newlcp == maxlcp) { + lcp_left = lcp; + lcp_right = minlcp; + goto fullmatch; + } + if (query+newlcp == str_end || + query[newlcp] < str[pivots[mid]+newlcp]) { + right = mid; + minlcp = lcp; + lcp_diff = newlcp - minlcp; + if (lcp_diff == 0) goto in_the_middle; + else goto closer_to_right; + } else { + left = mid+1; + lcp_diff = minlcp - newlcp; + } + } else { + left = mid+1; + } + } + } + + closer_to_right: + while (true) { + /* + std::cout << "right minlcp=" << int(minlcp) + << " lcp_diff=" << int(lcp_diff) + << " mid=" << left + (right-left)/2 + << "\n"; + */ + if (left == right) return left; + mid = left + (right-left)/2; + if (str+pivots[mid] == query) return mid+1; + lcptype lcp_mid = lcps[mid]; + if (lcp_mid < lcp_diff) { + if (lcp_mid > 0) { + lcp_diff -= lcp_mid; + minlcp += lcp_mid; + } + left = mid+1; + } else { + if (lcp_mid == lcp_diff) { + lcptype lcp = minlcp + lcp_diff; + lcptype newlcp = suffix_lcp(query, str+pivots[mid], str_end, + lcp, maxlcp); + if (newlcp == maxlcp) { + lcp_left = minlcp; + lcp_right = lcp; + goto fullmatch; + } + if (query+newlcp == str_end || + query[newlcp] < str[pivots[mid]+newlcp]) { + right = mid; + lcp_diff = newlcp - minlcp; + } else { + left = mid+1; + minlcp = lcp; + lcp_diff = minlcp - newlcp; + if (lcp_diff == 0) goto in_the_middle; + else goto closer_to_left; + } + } + right = mid; + } + } + + fullmatch: + // find the range of fully matching pivots + //std::cout << "fullmatch: left\n"; + { + difftype right_tmp = mid; + while (left != right_tmp) { + difftype mid = left + (right_tmp-left)/2; + //std::cout << left << mid << right_tmp << std::endl; + difftype lcp_diff = lcps[mid]; + //std::cout << " " << lcp_left << " " << lcp_diff << std::endl; + if (lcp_left + lcp_diff == maxlcp) { + right_tmp = mid; + } else { + if (lcp_diff > 0) { lcp_left += lcp_diff; } + left = mid+1; + } + } + } + //std::cout << "fullmatch: right\n"; + { + difftype left_tmp = mid+1; + while (left_tmp != right) { + difftype mid = left_tmp + (right-left_tmp)/2; + //std::cout << left_tmp << mid << right << std::endl; + difftype lcp_diff = lcps[mid]; + //std::cout << " " << lcp_right << " " << lcp_diff << std::endl; + if (lcp_right - lcp_diff == maxlcp) { + left_tmp = mid+1; + } else { + if (lcp_diff < 0) { lcp_right -= lcp_diff; } + right = mid; + } + } + } + //std::cout << "fullmatch: dcsearch\n"; + while (left != right) { + difftype mid = left + (right-left)/2; + //std::cout << left << mid << right << std::endl; + if (dc_less_or_equal(pivots[mid], query-str)) { + left = mid+1; + } else { + right = mid; + } + } + return left; + +} + +#endif + +}; + + +#endif // DISTRIBUTE_HPP diff --git a/dcover/doubling.hpp b/dcover/doubling.hpp new file mode 100644 index 0000000..3f8b29e --- /dev/null +++ b/dcover/doubling.hpp @@ -0,0 +1,204 @@ +/* + * doubling.hpp + * + * Copyright 2003 Max-Planck-Institut fuer Informatik + * + */ + +//====================================================================== +// Doubling algorithm for suffix array construction. +// Follows closely the algorithm described in +// +// N.J. Larsson and K. Sadakane. Faster Suffix Sorting. +// Technical Report LU-CS-TR:99-214, Dept. of Computer Science, +// Lund University, Sweden, 1999. +//====================================================================== + +#ifndef DOUBLING_HPP +#define DOUBLING_HPP + +#include "partition.hpp" + +#include +#include +#include +#include +#include + +//---------------------------------------------------------------------- +// functor isa_access +// +// Given a suffix array iterator computes the corresponding +// inverse suffix array entry to be used in comparisons. +//---------------------------------------------------------------------- +template +class isa_access + : public std::unary_function::value_type> +{ +private: + typedef typename std::iterator_traits::difference_type + difference_type; + const ISAIterator isa; +public: + isa_access(ISAIterator s, difference_type lcp) : isa(s+lcp) {} + typename std::iterator_traits::value_type + operator() (SAIterator i) const { + return isa[*i]; + } +}; + +//---------------------------------------------------------------------- +// function update_group +// +// Update a group after doubling. +// WARNING: cannot handle an empty group +//---------------------------------------------------------------------- +template +void update_group(SAIterator beg, SAIterator end, + SAIterator sa, ISAIterator isa) +{ + typedef typename std::iterator_traits::value_type + pos_type; + pos_type groupnumber = std::distance(sa, end) - 1; + if (end-beg > 1) { + for (SAIterator i = beg; i != end; ++i) { + isa[*i] = groupnumber; + } + } else { + isa[*beg] = groupnumber; + *beg = -1; + } +} + + +//---------------------------------------------------------------------- +// function selection_partition +// +// Performs doubling partitioning for small groups. +//---------------------------------------------------------------------- +template +void selection_partition(SAIterator sa, ISAIterator isa, + SAIterator beg, SAIterator end, + typename std::iterator_traits::difference_type lcp) +{ + typedef typename std::iterator_traits::value_type + pos_type; + isa_access key(isa, lcp); + while (end-beg > 1) { + SAIterator min_end = beg+1; + pos_type min_key = key(beg); + for (SAIterator i = beg+1; i != end; ++i) { + if (key(i) < min_key) { + std::swap(*beg, *i); + min_end = beg+1; + min_key = key(beg); + } else if (!(min_key < key(i))) { + std::swap(*min_end, *i); + ++min_end; + } + } + update_group(beg, min_end, sa, isa); + beg = min_end; + } + if (beg != end) update_group(beg, end, sa, isa); +} + + +//---------------------------------------------------------------------- +// function doubling_partition +// +// Given a group of suffixes with a common prefix of length lcp +// partition it to subgroups according to the next lcp characters. +// The next lcp characters are represented by a single value +// in the inverse suffix array. +//---------------------------------------------------------------------- +template +void doubling_partition(SAIterator sa, ISAIterator isa, + SAIterator beg, SAIterator end, + typename std::iterator_traits::difference_type lcp) +{ + typedef typename std::iterator_traits::value_type + pos_type; + + while (end-beg > 10) { + + // ternary partition + isa_access get_isa_entry(isa, lcp); + SAIterator pivot = random_pivot(beg, end, std::less(), + get_isa_entry); + std::pair midrange + = ternary_partition(beg, end, pivot, std::less(), + get_isa_entry); + + // recurse on <-part + if (beg != midrange.first) { + doubling_partition(sa, isa, beg, midrange.first, lcp); + } + + // update =-part + update_group(midrange.first, midrange.second, sa, isa); + + // loop on >-part + beg = midrange.second; + } + + selection_partition(sa, isa, beg, end, lcp); +} + + + +//====================================================================== +// function doubling_sort +// +// The main doubling algorithm +//====================================================================== +template +void +doubling_sort(SAIterator sa, SAIterator sa_end, + ISAIterator isa, ISAIterator isa_end, + typename std::iterator_traits::difference_type lcp) +{ + typedef typename std::iterator_traits::difference_type + difference_type; + + difference_type size = std::distance(sa, sa_end); + if (isa_end-isa != size) { + std::ostringstream os; + os << "doubling_sort:" + << " wrong range size"; + throw std::logic_error(os.str()); + } + + while (*sa > -size) { + + SAIterator beg = sa; + difference_type nrunlen = 0; + while (beg != sa_end) { + difference_type val = *beg; + if (val < 0) { + // jump over already sorted part + beg -= val; + nrunlen += val; + } else { + // combine preceding sorted parts + if (nrunlen < 0) { + *(beg+nrunlen) = nrunlen; + nrunlen = 0; + } + // partition unsorted group + SAIterator end = sa + isa[val] + 1; + doubling_partition(sa, isa, beg, end, lcp); + beg = end; + } + } + if (nrunlen < 0) { + *(beg+nrunlen) = nrunlen; + } + + lcp *= 2; + } +} + + +#endif // DOUBLING_HPP diff --git a/dcover/kmpsplit.hpp b/dcover/kmpsplit.hpp new file mode 100644 index 0000000..77e7d99 --- /dev/null +++ b/dcover/kmpsplit.hpp @@ -0,0 +1,196 @@ + +#include +#include +#include + +template +class kmp_split { +public: + + kmp_split(Iterator s, Iterator se, Iterator p, + const DCSampler& d, RankIterator r) + : str(s), str_end(se), suf(str), pivot(p), pivot_end(p+d.period()-1), + last(1), last_lcp(0), dcsampler(d), dcranks(r) + { + preprocess(); + } + +private: + + bool dc_less(Iterator a, Iterator b) const { + std::pair rep = dcsampler.pair(a-str, b-str); + return dcranks[rep.first] <= dcranks[rep.second]; + } + +public: + + bool is_next_less() { + // state: + // suf == str_end || + // last_lcp == lcp((suf-last), pivot) && + // last_less == (suf-last)+last_lcp == str_end || + // (suf-last)[last_lcp] < pivot[last_lcp] + + bool result = true; + if (suf==str_end) return true; + if (suf==pivot) { + last_lcp = pivot_end-pivot; + last_less = false; + ++suf; last=1; + return false; + } + + /* + std::cout << " last=" << last + << " last_lcp=" << last_lcp + << " last+lcp[last]=" << last+lcp[last] + << " last_less=" << last_less + << " less[last]=" << bool(less[last]); + */ + + if (last_lcp < last || last_lcp == last+lcp[last]) { + int lcp; + if (last_lcp < last) lcp = 0; + else lcp = last_lcp - last; + while (suf+lcp != str_end && pivot+lcp != pivot_end + && suf[lcp]==pivot[lcp]) { + ++lcp; + } + /* + std::cout << " lcp=" << lcp + << " suf+lcp-str=" << suf+lcp-str + << " str_end-str=" << str_end-str + << " suf+lcp==str=" << (suf+lcp==str); + */ + result = (suf+lcp==str_end) + || (pivot+lcp==pivot_end ? dc_less(suf, pivot) : suf[lcp] < pivot[lcp]); + //std::cout << " result=" << result << std::endl; + last_lcp = lcp; + last_less = result; + ++suf; last = 1; + } else if (last_lcp > last+lcp[last]) { + result = !less[last]; + ++suf; ++last; + } else if (last_lcp < last+lcp[last]) { + result = last_less; + last_lcp -= last; + ++suf; last = 1; + } + + //std::cout << " result=" << result << std::endl; + return result; + } + + //81 0 212 cabcbacacabcbacacabcbacacabcbacacabcbaca + +private: + Iterator str; + Iterator str_end; + Iterator suf; + Iterator pivot; + Iterator pivot_end; + + int last; + int last_lcp; + bool last_less; + + std::vector lcp; + std::vector less; + + const DCSampler& dcsampler; + RankIterator dcranks; + + void preprocess() { + + int length = pivot_end-pivot; + + int i = 0; + int j = 1; + + /* + std::vector shift(length+1); + shift[0] = -1; + + while (true) { + while ( j < length && pivot[i] == pivot[j] ) { + shift[j] = shift[i]; + ++i; ++j; + } + shift[j] = i; + if (j == length) break; + do { + i = shift[i]; + } while ( i != -1 && pivot[i] != pivot[j] ); + if (i==-1) { ++i; ++j; } + } + + std::copy (shift.begin(), shift.end(), + std::ostream_iterator(std::cout, " ")); + std::cout << std::endl; + */ + + lcp.resize(length+1); + less.resize(length+1); + + lcp[0] = length; + less[0] = 0; + i = 0; j = 1; + + while (j <= length) { + while ( j < length && pivot[i] == pivot[j] ) { + ++i; ++j; + } + if ( j == length || pivot[i] < pivot[j] ) { + lcp[j-i] = i; + less[j-i] = 1; + } else { + lcp[j-i] = i; + less[j-i] = 0; + } + int k = i; + do { + --i; + if (i == -1) { + ++i; ++j; + break; + } else if (lcp[k-i] == i) { + break; + } else if (lcp[k-i] < i) { + lcp[j-i] = lcp[k-i]; + less[j-i] = less[k-i]; + } else { + lcp[j-i] = i; // i == lcp[j-k] + less[j-i] = less[j-k]; + } + } while (true); + } + + /* + std::copy (lcp.begin(), lcp.end(), + std::ostream_iterator(std::cout, " ")); + std::cout << std::endl; + std::copy (less.begin(), less.end(), + std::ostream_iterator(std::cout, " ")); + std::cout << std::endl; + */ + } + + +}; + + +/* + +gagcgag +gagcgag 7 0 + g 0 0 + ga 1 1 + g 0 0 + gag 3 1 + g 0 0 + g 1 1 + 0 1 + +aagagcgagta + +*/ diff --git a/dcover/partition.hpp b/dcover/partition.hpp new file mode 100644 index 0000000..6b72c94 --- /dev/null +++ b/dcover/partition.hpp @@ -0,0 +1,435 @@ +/* + * partition.hpp + * + * Copyright 2003 Max-Planck-Institut fuer Informatik + * + */ + +//====================================================================== +// A collection of flexible, generic, STL-like algorithm components +// for comparison-based distribution (i.e., quicksort-like) sorting. +//====================================================================== + +#ifndef PARTITION_HPP +#define PARTITION_HPP + +#include +#include +#include +#include +#include // for rand() + +//====================================================================== +// Most of the algorithms here have three versions. For example a sort +// function can have the following forms: +// sort(beg, end); +// sort(beg, end, less); +// sort(beg, end, less, key); +// The first two forms are standard STL. The third form adds an +// argument key, which is a unary function. Where the second +// form would compare two iterators i and j as less(*i, *j), +// the third form does less(key(i), key(j)) instead. This allows +// moving some of the computation related to the comparison from +// less() into key(). When the same iterator is used in many +// comparisons (for example the pivot), key() needs to be evaluated +// only once. +// +// For example, to sort objects of type T with a member function +// size() returning int into increasing order of size() do +// (TODO: test/expand this, use vector of vectors) +// +// sort(beg, end, std::less(), std::mem_fun(&T::size)); +// +// key() may return a pointer or reference to (a member of) the object. +// All the algorithms assume that the value returned by key() stays +// valid only as long as the object is not moved. +// +// TYPE REQUIREMENTS +// For example: +// template +// void sort(Iterator, Iterator, Compare, KeyAccess); +// * KeyAccess is a model of Adaptable Unary Function +// * Iterator is convertible to KeyAccess::argument_type +// * KeyAccess::result_type is convertible to the argument type +// of Compare. +//====================================================================== + +//---------------------------------------------------------------------- +// functor copy_dereference +// functor const_dereference +// +// Default functors to be used as key(). +// (TODO: which one should be used as the default?) +// (TODO: is const_dereference::result_type a problem?) +//---------------------------------------------------------------------- +template +class const_dereference + : public std::unary_function::value_type &> +{ +public: + const typename std::iterator_traits::value_type & + operator() (Iterator i) const { return *i; } +}; + +template +class copy_dereference + : public std::unary_function::value_type> +{ +public: + typename std::iterator_traits::value_type + operator() (Iterator i) const { return *i; } +}; + +template +class default_key : public copy_dereference {}; + + + +//====================================================================== +// +// Pivot selection +// +// (TODO: non-randomized pivot selectors) +//====================================================================== + +//---------------------------------------------------------------------- +// function iter_median3 +// +// Median of three +//---------------------------------------------------------------------- +template +Iterator +iter_median3(Iterator a, Iterator b, Iterator c, + Compare less, KeyAccess key) { + typedef typename KeyAccess::result_type key_type; + key_type ka = key(a), kb = key(b), kc = key(c); + if (less(ka, kb)) + if (less(kb, kc)) return b; + else if (less(ka, kc)) return c; + else return a; + else if (less(ka, kc)) return a; + else if (less(kb, kc)) return c; + else return b; +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template +Iterator +iter_median3(Iterator a, Iterator b, Iterator c, Compare less) { + return iter_median3(a, b, c, less, default_key()); +} + +template +Iterator +iter_median3(Iterator a, Iterator b, Iterator c) { + typedef typename std::iterator_traits::value_type key_type; + return iter_median3(a, b, c, std::less(), + default_key()); +} + + +//---------------------------------------------------------------------- +// function random_pivot +// +// Choose pivot of as a median of three random elements +// (or as a single random element for small ranges) +// +// (TODO: better/faster random number generation?) +//---------------------------------------------------------------------- +template +RandomAccessIterator +random_pivot (RandomAccessIterator beg, RandomAccessIterator end, + Compare less, KeyAccess key) +{ + static double scale_to_one = 1.0L/(RAND_MAX+1.0); + + typedef typename std::iterator_traits::difference_type + difference_type; + difference_type size = std::distance(beg, end); + double scale_to_size = size * scale_to_one; + RandomAccessIterator pivot; + pivot = beg + static_cast(scale_to_size*std::rand()); + if (size > 50) { + RandomAccessIterator pivot2, pivot3; + pivot2 = beg + static_cast(scale_to_size*std::rand()); + pivot3 = beg + static_cast(scale_to_size*std::rand()); + pivot = iter_median3(pivot, pivot2, pivot3, less, key); + } + return pivot; +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template +RandomAccessIterator +random_pivot(RandomAccessIterator beg, RandomAccessIterator end, + Compare less) +{ + return random_pivot(beg, end, less, + default_key()); +} + +template +RandomAccessIterator +random_pivot(RandomAccessIterator beg, RandomAccessIterator end) +{ + typedef typename std::iterator_traits::value_type + key_type; + return random_pivot(beg, end, std::less(), + default_key()); +} + + + + +//====================================================================== +// Partitioning +//====================================================================== + +//---------------------------------------------------------------------- +// function binary_partition +// +// Partitions the input range [beg,end) into two parts [beg,mid) and +// [mid,end) with all elements smaller than pivot in [beg, mid) and +// all elements larger than pivot in [mid,end). Equal elements may +// be in either subrange. Returns mid. +// PRECONDITION: pivot is in the range [beg,end) +//---------------------------------------------------------------------- +template +RandomAccessIterator +binary_partition(RandomAccessIterator beg, RandomAccessIterator end, + RandomAccessIterator pivot, Compare less, KeyAccess key) +{ + if (beg == --end) return beg; + + // move the pivot to a safe place and setup sentinels + std::swap(*beg, *pivot); + if (less(key(end), key(beg))) { + std::swap(*beg, *end); + pivot = end; + } else { + pivot = beg; + } + + // now the pivot won't move anymore + typename KeyAccess::result_type pivot_key = key(pivot); + + while (true) { + do { ++beg; } while (less(key(beg), pivot_key)); + do { --end; } while (less(pivot_key, key(end))); + if (!(beg < end)) return beg; + std::swap(*beg, *end); + } +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template +RandomAccessIterator +binary_partition(RandomAccessIterator beg, RandomAccessIterator end, + RandomAccessIterator pivot, Compare less) +{ + return binary_partition(beg, end, pivot, less, + default_key()); +} + +template +RandomAccessIterator +binary_partition(RandomAccessIterator beg, RandomAccessIterator end, + RandomAccessIterator pivot) +{ + typedef typename std::iterator_traits::value_type + key_type; + return binary_partition(beg, end, pivot, std::less(), + default_key()); +} + + +//---------------------------------------------------------------------- +// function ternary_partition +// +// Partitions the input range [beg,end) into three parts according to +// comparisons to the pivot: | less | equal | greater | +// Returns the middle (equal) range. +// PRECONDITION: pivot is in the range [beg,end) +// +// METHOD +// The algorithm maintains the following invariant: +// +// | equal | less | unknown | greater | equal | +// ^ ^ ^ ^ ^ ^ +// beg first_less left right last_greater end +// +//---------------------------------------------------------------------- +template +std::pair +ternary_partition(RandomAccessIterator beg, RandomAccessIterator end, + RandomAccessIterator pivot, Compare less, + KeyAccess key) +{ + RandomAccessIterator left = beg, right = end-1; + RandomAccessIterator first_less = left, last_greater = right; + + // first move pivot to a safe place and setup sentinels + std::iter_swap(pivot, left); + if (less(key(right), key(left))) { + std::iter_swap(left, right); + pivot = right; + ++left; --right; --last_greater; + } else { + pivot = left; + ++left; ++first_less; + } + + // now the pivot won't move anymore + typename KeyAccess::result_type pivot_key = key(pivot); + + while(true) { + while(less(key(left), pivot_key)) ++left; + while (!less(pivot_key, key(left)) && left < right) { + std::iter_swap(left++, first_less++); + while(less(key(left), pivot_key)) ++left; + } + while(less(pivot_key, key(right))) --right; + while (!less(key(right), pivot_key) && left < right) { + std::iter_swap(right--, last_greater--); + while(less(pivot_key, key(right))) --right; + } + if (!(left < right)) break; + std::iter_swap(left++, right--); + } + + // now the situation is this: + // + // | equal | less | greater | equal | + // ^ ^ ^ ^ ^ ^ + // beg first_less right left last_greater end + // + // or this: + // + // | equal | less |=| greater | equal | + // ^ ^ ^ ^ ^ + // beg first_less left last_greater end + // right + // + // next: swap equal parts to the middle + + typename std::iterator_traits::difference_type size; + size = std::min(first_less-beg, left-first_less); + std::swap_ranges(beg, beg+size, left-size); + size = std::min(end-1-last_greater, last_greater-right); + std::swap_ranges(right+1, right+1+size, end-size); + return std::make_pair(beg+(left-first_less), end-(last_greater-right)); +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template +std::pair +ternary_partition(RandomAccessIterator beg, RandomAccessIterator end, + RandomAccessIterator pivot, Compare less) +{ + return ternary_partition(beg, end, pivot, less, + default_key()); +} + +template +std::pair +ternary_partition(RandomAccessIterator beg, RandomAccessIterator end, + RandomAccessIterator pivot) +{ + typedef typename std::iterator_traits::value_type + key_type; + return ternary_partition(beg, end, pivot, std::less(), + default_key()); +} + + +//====================================================================== +// Sorting +//====================================================================== + +//---------------------------------------------------------------------- +// function insertion_sort +// +//---------------------------------------------------------------------- +template +void +insertion_sort(RandomAccessIterator beg, RandomAccessIterator end, + Compare less, KeyAccess key) +{ + if (end-beg <= 1) return; + for (RandomAccessIterator i = beg+1; i != end; ++i) { + typename KeyAccess::result_type i_key = key(i); // i is not moved ... + RandomAccessIterator j = i-1; + if (!less(i_key, key(j))) continue; + for ( ; j != beg; --j) { + if (!less(i_key, key(j-1))) break; + std::swap(*(j-1), *j); + } + std::swap(*i, *j); // ... until here + } +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template +void +insertion_sort(RandomAccessIterator beg, RandomAccessIterator end, + Compare less) +{ + insertion_sort(beg, end, less, default_key()); +} + +template +void +insertion_sort(RandomAccessIterator beg, RandomAccessIterator end) +{ + typedef typename std::iterator_traits::value_type + key_type; + insertion_sort(beg, end, std::less(), + default_key()); +} + + +//---------------------------------------------------------------------- +// function quicksort +// +//---------------------------------------------------------------------- +template +void +quicksort(RandomAccessIterator beg, RandomAccessIterator end, + Compare less, KeyAccess key) +{ + while (end-beg > 15) { + RandomAccessIterator pivot = random_pivot(beg, end, less, key); + RandomAccessIterator mid = binary_partition(beg, end, pivot, less, key); + quicksort(beg, mid, less, key); + beg = mid; + } + insertion_sort(beg, end, less, key); +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + +template +void +quicksort(RandomAccessIterator beg, RandomAccessIterator end, Compare less) +{ + quicksort(beg, end, less, default_key()); +} + +template +void +quicksort(RandomAccessIterator beg, RandomAccessIterator end) +{ + typedef typename std::iterator_traits::value_type + key_type; + quicksort(beg, end, std::less(), + default_key()); +} + + +#endif // PARTITION_HPP diff --git a/dcover/stringsort.hpp b/dcover/stringsort.hpp new file mode 100644 index 0000000..faef78f --- /dev/null +++ b/dcover/stringsort.hpp @@ -0,0 +1,337 @@ +/* + * stringsort.hpp + * + * Copyright 2003 Max-Planck-Institut fuer Informatik + * + */ + +//====================================================================== +// Multikey quicksort for suffixes +//====================================================================== + +#ifndef STRINGSORT_HPP +#define STRINGSORT_HPP + +#include "partition.hpp" + +#include +#include +#include +#include + + +//---------------------------------------------------------------------- +// function string_compare +// +// Compare two strings until a mismatch but no longer than limit +// characters. The sign of the result indicates the order of the +// strings and the absolute value the distance of the mismatch from +// the limit. Zero means that no mismatch was found within limit +// characters. +// +// WARNING: A check for the end of a string is user's responsibility. +//---------------------------------------------------------------------- +template +DifferenceType +string_compare(StringIterator s1, StringIterator s2, + DifferenceType limit, CharCompare less) { + for ( ; limit > 0; --limit, ++s1, ++s2) { + if (less(*s1,*s2)) return -limit; + if (less(*s2,*s1)) return limit; + } + return 0; +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +template +inline DifferenceType +string_compare(StringIterator s1, StringIterator s2, DifferenceType limit) +{ + typedef typename std::iterator_traits::value_type + char_type; + return string_compare(s1, s2, limit, std::less()); +} + + +//---------------------------------------------------------------------- +// const int suffix_insertion_sort_threshold_ +// +// Groups of suffixes with size less or equal to this value are +// sorted by insertion sort. +//---------------------------------------------------------------------- +static const int suffix_insertion_sort_threshold_ = 15; + +//---------------------------------------------------------------------- +// function suffix_insertion_sort +// +// Sort suffixes using insertion sort. +// A difference from a straightforward insertion sort is that +// the lengths of longest common prefixes of the already sorted +// suffixes are stored and used in later insertions. +//---------------------------------------------------------------------- +template +void +suffix_insertion_sort(StringIterator str, StringIterator str_end, + SArrayIterator sa, SArrayIterator sa_end, + typename std::iterator_traits::difference_type limit, + GroupHandler handle_unsorted_group, + CharCompare less) +{ + typedef typename std::iterator_traits::difference_type + str_diff_type; + typedef typename std::iterator_traits::difference_type + sa_diff_type; + typedef typename std::iterator_traits::value_type + pos_type; + + static std::vector lcp(suffix_insertion_sort_threshold_); + lcp.resize(sa_end-sa); + lcp.back() = -1; + + for (sa_diff_type i = sa_end-sa-2; i >= 0; --i) { + pos_type i_pos = sa[i]; + StringIterator i_str = str + i_pos; + StringIterator j_str = str + sa[i+1]; + str_diff_type i_limit = std::min(limit, str_end - i_str); + str_diff_type ij_limit = std::min(i_limit, str_end - j_str); + str_diff_type dist = string_compare(i_str, j_str, ij_limit, less); + if (dist < 0 || (dist == 0 && i_limit == ij_limit)) { + lcp[i] = ij_limit + dist; + } else { + sa[i] = sa[i+1]; + str_diff_type i_lcp = ij_limit - dist; + + sa_diff_type j = i+1; + for ( ; lcp[j] >= i_lcp; ++j) { + if (lcp[j] == i_lcp) { + j_str = str+sa[j+1]; + ij_limit = std::min(i_limit, str_end-j_str); + dist = string_compare(i_str+i_lcp, j_str+i_lcp, + ij_limit-i_lcp, less); + if (dist < 0 || (dist == 0 && i_limit == ij_limit)) { + lcp[j] = ij_limit + dist; + break; + } else { + i_lcp = ij_limit - dist; + } + } + sa[j] = sa[j+1]; + lcp[j-1] = lcp[j]; + } + + sa[j] = i_pos; + lcp[j-1] = i_lcp; + } + } + + // find and process unsorted groups + typename std::vector::iterator beg, end, last = lcp.end()-1; + for (beg = lcp.begin(); beg < last; ++beg) { + if (*beg == limit) { + end = beg; + do { ++end; } while (*end == limit); + handle_unsorted_group(sa+(beg-lcp.begin()), sa+(end-lcp.begin())+1); + beg = end; + } + } +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +template +void +suffix_insertion_sort(StringIterator str, StringIterator str_end, + SArrayIterator sa, SArrayIterator sa_end, + typename std::iterator_traits::difference_type limit, + GroupHandler handle_unsorted_group) +{ + typedef typename std::iterator_traits::value_type + char_type; + suffix_insertion_sort(str, str_end, sa, sa_end, limit, + handle_unsorted_group, std::less()); +} + + +//---------------------------------------------------------------------- +// functor char_access +// +// Compute the character that represents a suffix in comparisons. +//---------------------------------------------------------------------- +template +class char_access + : public std::unary_function::value_type> +{ +private: + const StringIterator str; +public: + char_access(StringIterator s) : str(s) {} + + typename std::iterator_traits::value_type + operator() (SArrayIterator i) const { + return str[*i]; + } +}; + + +//====================================================================== +// function suffix_mkqsort +// function suffix_mkqsort_noempty +// +// Sort suffixes using multikey quicksort. +// These two functions call each other to implement multikey quicksort. +// Either can be used as the initial call. suffix_mkqsort_noempty +// places more restrictions on the input but can be slightly faster. +//====================================================================== + +//---------------------------------------------------------------------- +// function suffix_mkqsort +// +// Sort suffixes using multikey quicksort. +// This is the more general form. +//---------------------------------------------------------------------- +template +void +suffix_mkqsort(StringIterator str, StringIterator str_end, + SArrayIterator sa, SArrayIterator sa_end, + typename std::iterator_traits::difference_type limit, + GroupHandler handle_unsorted_group, + CharCompare less) +{ + typedef typename std::iterator_traits::value_type pos_type; + + while (sa_end-sa > suffix_insertion_sort_threshold_ && limit > 0) { + + // check for empty suffix(es) + pos_type length = static_cast(str_end-str); + sa = std::partition(sa, sa_end, + std::bind2nd(std::equal_to(), length)); + + // this would check only for one empty suffix + //SArrayIterator empty = std::find(sa, sa_end, length); + //if (empty != sa_end) { + // std::swap(*sa, *empty); + // ++sa; + //} + + // partition + char_access key(str); + SArrayIterator pivot = random_pivot(sa, sa_end, less, key); + std::pair midrange; + midrange = ternary_partition(sa, sa_end, pivot, less, key); + + // recurse on <-part + if (midrange.first - sa > 1) { + suffix_mkqsort_noempty(str, str_end, sa, midrange.first, + limit, handle_unsorted_group, less); + } + + // recurse on >-part + if (sa_end - midrange.second > 1) { + suffix_mkqsort_noempty(str, str_end, midrange.second, sa_end, + limit, handle_unsorted_group, less); + } + + // loop on =-part + sa = midrange.first; + sa_end = midrange.second; + + // move to the next position in the suffixes + ++str; + --limit; + + } + + if (sa_end-sa > 1) { + if (limit == 0) { + handle_unsorted_group(sa, sa_end); + } else { + suffix_insertion_sort(str, str_end, sa, sa_end, limit, + handle_unsorted_group, less); + } + } +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +template +void +suffix_mkqsort(StringIterator str, StringIterator str_end, + SArrayIterator sa, SArrayIterator sa_end, + typename std::iterator_traits::difference_type limit, + GroupHandler handle_unsorted_group) +{ + typedef typename std::iterator_traits::value_type + char_type; + suffix_mkqsort(str, str_end, sa, sa_end, limit, + handle_unsorted_group, std::less()); +} + + +//---------------------------------------------------------------------- +// function suffix_mkqsort_noempty +// +// Sort suffixes using multikey quicksort. +// Used as a subroutine of suffix_mkqsort but can also be called +// directly. Places more restrictions on input (see below) but +// can be slightly faster than suffix_mkqsort. +// +// The additional restrictions: +// PRECONDITION: limit>0 and there are no empty suffixes +//---------------------------------------------------------------------- +template +void +suffix_mkqsort_noempty(StringIterator str, StringIterator str_end, + SArrayIterator sa, SArrayIterator sa_end, + typename std::iterator_traits::difference_type limit, + GroupHandler handle_unsorted_group, + CharCompare less) +{ + typedef typename std::iterator_traits::value_type pos_type; + + while (sa_end-sa > suffix_insertion_sort_threshold_) { + + // partition + char_access key(str); + SArrayIterator pivot = random_pivot(sa, sa_end, less, key); + std::pair midrange; + midrange = ternary_partition(sa, sa_end, pivot, less, key); + + // recurse on <-part + if (midrange.first - sa > 1) { + suffix_mkqsort_noempty(str, str_end, sa, midrange.first, + limit, handle_unsorted_group, less); + } + + // recurse on =-part + if (midrange.second - midrange.first > 1) { + suffix_mkqsort(str+1, str_end, midrange.first, midrange.second, + limit-1, handle_unsorted_group, less); + } + + // loop on >-part + sa = midrange.second; + } + + if (sa_end-sa > 1) { + suffix_insertion_sort(str, str_end, sa, sa_end, limit, + handle_unsorted_group, less); + } +} +// - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - +template +void +suffix_mkqsort_noempty(StringIterator str, StringIterator str_end, + SArrayIterator sa, SArrayIterator sa_end, + typename std::iterator_traits::difference_type limit, + GroupHandler handle_unsorted_group) +{ + typedef typename std::iterator_traits::value_type + char_type; + suffix_mkqsort_noempty(str, str_end, sa, sa_end, limit, + handle_unsorted_group, std::less()); +} + +#endif // STRINGSORT_HPP