4 * Copyright 2004 Juha Ka"rkka"inen <juha.karkkainen@cs.helsinki.fi>
8 #include "dcsuffixsort.hpp"
9 #include "dcsamplesort.hpp"
10 #include "stringsort.hpp"
11 #include "difference_cover.hpp"
12 #include "distribute.hpp"
13 #include "kmpsplit.hpp"
15 #include "doubling.hpp"
38 #define OVERSAMPLING_RATE 10.0
42 template <typename Iterator>
43 void operator() (Iterator, Iterator) {}
49 template <class StringIterator, class OutputIterator>
50 typename std::iterator_traits<StringIterator>::difference_type
51 compute_bwt (StringIterator str, StringIterator str_end,
52 OutputIterator bwt, unsigned endmarkers, unsigned int dcover = DCOVER)
55 typedef typename std::iterator_traits<StringIterator>::difference_type
56 difftype; // signed integral type
57 typedef unsigned long sizetype; // unsigned integral type
58 typedef difftype postype; // type for representing vector positions
60 typedef typename std::vector<postype>::iterator pos_iterator;
63 std::cout << "BWT construction starts" << std::endl;
70 sizetype length = std::distance(str, str_end);
73 typedef typename std::iterator_traits<StringIterator>::value_type
75 typedef typename std::iterator_traits<OutputIterator>::value_type
77 std::cout << " text memory=" << length*sizeof(chartype) << "\n"
78 << " bwt memory=" << (length+1)*sizeof(chartype) << std::endl;
81 dc_sampler<postype> dcsampler(length+1, dcover);
84 std::cout << " dcsampler memory="
85 << (dcsampler.packed_period() + 2*dcsampler.period())
90 sizetype blocksize = dcsampler.samplesize();
91 sizetype npivots = MIN_PIVOTS;
93 // make average bucket size <= blocksize/OVERSAMPLING_RATE
94 while ((OVERSAMPLING_RATE*length)/(npivots+1.0) > blocksize) ++npivots;
97 std::cout << " length=" << length
98 << " samplesize=" << dcsampler.samplesize()
99 << " blocksize=" << blocksize
100 << " npivots=" << npivots
104 if (dcsampler.period() + 2*npivots > length) {
105 // too small text for sampling
108 std::cout << "small text: sort all suffixes\n";
111 std::vector<postype> sa(length);
112 for (pos_iterator i=sa.begin(); i!=sa.end(); ++i) {
115 suffix_mkqsort(str, str_end, sa.begin(), sa.end(), length, do_nothing());
116 difftype eof_position = 0;
117 for (pos_iterator i=sa.begin(); i!=sa.end(); ++i) {
119 if (str[*i-1] <= endmarkers)
122 *bwt++ = str[*i-1] - endmarkers;
124 if (str[*(i-1)-1] <= endmarkers)
127 *bwt++ = str[*(i-1)-1] - endmarkers;
128 eof_position = i - sa.begin() +1; // FIXME
134 // don't take too short suffixes as pivots
135 unsigned int pivotrange = length-dcsampler.period()+1;
137 //-----------------------------------
138 // setup difference cover sample
139 //-----------------------------------
142 std::cout << "difference cover construction ... " << std::endl;
145 sizetype samplesize = dcsampler.samplesize();
146 std::vector<difftype> dcranks(samplesize);
147 std::vector<difftype> suffixes(samplesize);
150 std::cout << "dcranks memory="
151 << samplesize * sizeof(difftype)
153 std::cout << "suffixes memory="
154 << samplesize * sizeof(difftype)
158 sort_dc_sample(dcsampler, str, str_end,
159 dcranks.begin(), dcranks.end(),
160 suffixes.begin(), suffixes.end());
163 std::cout << "sample ranks:\n";
164 std::copy(dcranks.begin(), dcranks.end(),
165 std::ostream_iterator<int>(std::cout,"\n"));
166 std::cout << "end of sample ranks" << std::endl;
171 //-------------------
173 //-------------------
176 std::cout << "setting up pivots ..." << std::endl;
179 std::set<difftype> pivotset;
180 double scale_factor = (pivotrange * 1.0L)/(RAND_MAX+1.0L);
181 std::srand(RANDSEED);
182 while (pivotset.size() < npivots) {
184 int newpivot = static_cast<difftype>(scale_factor*r);
185 pivotset.insert(newpivot);
189 std::cout << " pivotset memory = " << pivotset.size()
190 << " elements" << std::endl;
193 std::vector<difftype> pivots(pivotset.begin(), pivotset.end());
197 dc_sort_suffixes(str, str_end, pivots.begin(), pivots.end(),
198 dcsampler, dcranks.begin(), dcranks.end());
201 std::cout << "pivots sorted lexicographically:\n";
202 for (pos_iterator i = pivots.begin(); i != pivots.end(); ++i) {
203 std::cout << std::setw(8) << *i << " ";
204 std::copy(str+*i, std::min(str_end, str+*i+40),
205 std::ostream_iterator<char>(std::cout,""));
206 std::cout << std::endl;
212 std::cout << " constructing distributor ... " << std::endl;
215 suffix_distributor<StringIterator, pos_iterator,
216 dc_sampler<postype>, pos_iterator>
217 distributor(str, str_end, pivots.begin(), pivots.end(),
218 dcsampler, dcranks.begin());
226 //------------------------
227 // compute bucketsizes
228 //------------------------
231 std::cout << "computing bucket sizes ... " << std::endl;
234 std::vector<difftype> buckets(distributor.nbuckets());
237 std::cout << " number of buckets: " << distributor.nbuckets() << "\n";
238 std::cout << " bucket vector memory="
239 << buckets.size() * sizeof(difftype)
243 for (StringIterator suf = str; suf != str_end; ++suf) {
244 unsigned int bucket = distributor.find_bucket(suf);
248 std::cout << " into bucket " << bucket;
249 std::cout << ":" << std::setw(7) << suf-str << " ";
250 std::copy(suf, std::min(str_end, suf+40),
251 std::ostream_iterator<char>(std::cout,""));
252 std::cout << std::endl;
256 // ++buckets[0]; // empty suffix
259 std::cout << " bucket sizes: ";
260 std::copy(buckets.begin(), buckets.end(),
261 std::ostream_iterator<int>(std::cout," "));
262 std::cout << std::endl;
266 //-----------------------
267 // build and sort blocks
268 //-----------------------
271 std::cout << "Computing and sorting blocks ... " << std::endl;
274 // *bwt++ = *(str_end-1); //FIXME no empty suffix?
276 postype eof_position = 0;
277 sizetype total_sum = 1;
279 postype bucket_begin = 0;
280 while (bucket_begin < (postype)buckets.size()) {
282 sizetype sum = buckets[bucket_begin];
283 postype bucket_end = bucket_begin + 1;
284 while (bucket_end < (postype)buckets.size()
285 && sum+buckets[bucket_end] <= blocksize) {
286 sum += buckets[bucket_end];
290 postype left_pivot = bucket_begin-1;
291 postype right_pivot = bucket_end-1;
294 if (left_pivot >= 0 && right_pivot < (postype)pivots.size()) {
296 lcp = suffix_lcp(str+pivots[left_pivot], str+pivots[right_pivot],
297 str_end, 0l, dcsampler.period()-1);
299 lcp = suffix_lcp(str+pivots[left_pivot], str+pivots[right_pivot],
300 str_end, 0l, dcsampler.period()-1);
305 std::cout << "block [" << bucket_begin << "," << bucket_end << ")"
311 if (sum > blocksize) {
313 suffixes.resize(blocksize);
316 std::cout << " blocksize increased to "
317 << blocksize << "\n";
318 std::cout << " suffixes memory resized to "
319 << blocksize*sizeof(postype)
325 pos_iterator i = suffixes.begin();
326 //if (bucket_begin == 0) { *i++ = length; // empty suffix }
328 postype left_pivot_not_beyond = left_pivot<0 ? 0 : left_pivot;
329 kmp_split<StringIterator, dc_sampler<postype>, pos_iterator>
330 left_cmp(str, str_end, str+pivots[left_pivot_not_beyond],
331 dcsampler, dcranks.begin());
333 postype right_pivot_not_beyond =
334 right_pivot==(postype)pivots.size() ? right_pivot-1 : right_pivot;
335 kmp_split<StringIterator, dc_sampler<postype>, pos_iterator>
336 right_cmp(str, str_end, str+pivots[right_pivot_not_beyond],
337 dcsampler, dcranks.begin());
339 for (StringIterator suf = str; suf != str_end; ++suf) {
342 std::cout << ":" << std::setw(3) << suf-str << " ";
343 std::copy(suf, std::min(str_end, suf+40),
344 std::ostream_iterator<char>(std::cout,""));
345 std::cout << std::endl;
349 // is the suffix left of the block
350 if (left_pivot >=0) {
351 postype pivotpos = pivots[left_pivot];
352 if (*suf < str[pivotpos]) continue;
353 int lcp = suffix_lcp(suf, str+pivotpos,
354 str_end, 0, dcsampler.period()-1);
355 if (lcp < dcsampler.period()-1) {
356 if (suf+lcp == str_end || suf[lcp] < (str+pivotpos)[lcp]) {
360 postype sufpos = suf-str;
361 std::pair<postype,postype> result
362 = dcsampler.pair(sufpos, pivotpos);
363 if (dcranks[result.first] < dcranks[result.second]){
369 // is the suffix right of the block
370 if (right_pivot < (postype)pivots.size()) {
371 postype pivotpos = pivots[right_pivot];
372 if (*suf > str[pivotpos]) continue;
373 int lcp = suffix_lcp(str+pivotpos, suf,
374 str_end, 0, dcsampler.period()-1);
375 if (lcp < dcsampler.period()-1) {
376 if (suf+lcp < str_end && (str+pivotpos)[lcp] < suf[lcp]) {
380 postype sufpos = suf-str;
381 std::pair<postype,postype> result
382 = dcsampler.pair(pivotpos, sufpos);
383 if (dcranks[result.first] <= dcranks[result.second]){
390 //postype bucket = distributor.find_bucket(suf);
392 bool to_left = left_pivot>0 && left_cmp.is_next_less();
393 bool to_right = right_pivot<(postype)pivots.size() && !right_cmp.is_next_less() ;
395 if (to_left || to_right) {
396 //assert (bucket_begin > bucket || bucket >= bucket_end);
400 // it is in the bucket
401 //postype bucket = distributor.find_bucket(suf);
403 //if (bucket_begin > bucket || bucket >= bucket_end) continue;
405 //assert (bucket_begin <= bucket && bucket < bucket_end);
409 //std::cout << "into bucket " << bucket;
410 std::cout << ":" << std::setw(3) << suf-str << " ";
411 std::copy(suf, std::min(str_end, suf+40),
412 std::ostream_iterator<char>(std::cout,""));
413 std::cout << std::endl;
418 assert(i == suffixes.begin()+sum);
420 dc_sort_suffixes(str, str_end, suffixes.begin(), i,
421 dcsampler, dcranks.begin(), dcranks.end());
424 std::cout << "sorted bucket:\n";
425 for (pos_iterator j = suffixes.begin(); j != i; ++j) {
426 std::cout << std::setw(3) << *j << " ";
427 std::copy(str+*j, std::min(str_end, str+*j+40),
428 std::ostream_iterator<char>(std::cout,""));
429 std::cout << std::endl;
433 for (pos_iterator j=suffixes.begin(); j != i; ++j) {
435 if (str[*j-1] <= endmarkers)
438 *bwt++ = str[*j-1] - endmarkers;
440 if (str[*(j-1)-1] <= endmarkers)
443 *bwt++ = str[*(j-1)-1] - endmarkers;
444 eof_position = total_sum + (j - suffixes.begin());
450 bucket_begin = bucket_end;
453 assert(total_sum == length+1);