4 * Copyright 2004 Juha K"arkk"ainen <juha.karkkainen@cs.helsinki.fi>
11 #include "dcsuffixsort.hpp"
18 //----------------------------------------------------------------------
21 // Compute the length of the longest common prefix between two
23 // str_end = end of the text
24 // minlcp = length of the already known common prefix
25 // maxlcp = stop when the length reaches maxlcp
26 //----------------------------------------------------------------------
27 template <class StringIterator, class LcpType>
29 suffix_lcp(StringIterator a, StringIterator b, StringIterator str_end,
30 LcpType minlcp, LcpType maxlcp)
32 if (a < b) swap(a,b); // make a the shorter suffix
33 if ((str_end-a) > maxlcp) str_end = a + maxlcp;
34 std::pair<StringIterator,StringIterator> mm =
35 std::mismatch(a+minlcp, str_end, b+minlcp);
36 //std::cout << " lcp=" << mm.first-a << "\n";
37 return static_cast<LcpType>(mm.first-a);
41 //----------------------------------------------------------------------
42 // class suffix_distributor
43 //----------------------------------------------------------------------
44 template <class StringIterator, class SuffixIterator,
45 class DCSampler, class RankIterator>
46 class suffix_distributor
49 //typedef typename std::iterator_traits<StringIterator>::difference_type
51 typedef short lcptype;
53 std::iterator_traits<SuffixIterator>::difference_type difftype;
56 StringIterator str, str_end;
57 SuffixIterator pivots, pivots_end;
58 std::vector<lcptype> lcps;
59 const DCSampler& dcsampler;
63 typedef typename std::vector<lcptype>::iterator lcpiterator;
67 suffix_distributor(StringIterator str_, StringIterator str_end_,
68 SuffixIterator pivots_, SuffixIterator pivots_end_,
69 const DCSampler& dcsampler_, RankIterator dcranks_)
70 : str(str_), str_end(str_end_), pivots(pivots_), pivots_end(pivots_end_),
71 lcps(pivots_end-pivots), dcsampler(dcsampler_), dcranks(dcranks_),
72 maxlcp(dcsampler.period())
77 void reset(SuffixIterator pivots_, SuffixIterator pivots_end_)
80 pivots_end=pivots_end_;
81 lcps.resize(pivots_end-pivots);
87 difftype nbuckets() const{ return pivots_end-pivots+1; }
89 difftype find_bucket(StringIterator suf) const {
90 return binary_search(suf);
97 std::cout << "distributor memory="
98 << lcps.size()*sizeof(lcptype)
101 std::cout << "pivots and lcps:\n";
102 for (SuffixIterator i = pivots; i != pivots_end; ++i) {
103 std::cout << std::setw(4) << i-pivots
104 << std::setw(4) << int(lcps[i-pivots])
105 << std::setw(9) << *i << " ";
106 std::copy(str+*i, std::min(str_end, str+*i+40),
107 std::ostream_iterator<char>(std::cout,""));
110 std::cout << "distributor end\n";
118 bool dc_less_or_equal(difftype a, difftype b) const {
119 std::pair<difftype,difftype> rep = dcsampler.pair(a, b);
120 return dcranks[rep.first] <= dcranks[rep.second];
123 //----------------------------------------------------------------------
124 // compute_lcps_bounded
126 // Compute all the lcp values in a subrange bounded by
127 // actual pivot suffixes.
128 //----------------------------------------------------------------------
130 compute_lcps_bounded(SuffixIterator left_suf, SuffixIterator right_suf,
131 lcpiterator left_lcp,
132 lcptype minlcp, lcptype maxlcp)
134 if (left_suf == right_suf) return;
135 difftype midpoint = (right_suf-left_suf)/2;
136 //std::cout << " midpoint of [0," << right_suf-left_suf
137 // << ")=" << midpoint << "\n";
139 SuffixIterator mid_suf = left_suf + midpoint;
140 lcpiterator mid_lcp = left_lcp + midpoint;
142 lcptype llcp_left = suffix_lcp(str+*(left_suf-1), str+*mid_suf,
143 str_end, minlcp, maxlcp);
144 lcptype llcp_right = suffix_lcp(str+*mid_suf, str+*right_suf,
145 str_end, minlcp, maxlcp);
146 *mid_lcp = llcp_right - llcp_left;
147 //std::cout << *mid_lcp << std::endl;
149 compute_lcps_bounded(left_suf, mid_suf, left_lcp, llcp_left, maxlcp);
150 compute_lcps_bounded(mid_suf+1, right_suf, mid_lcp+1, llcp_right, maxlcp);
154 //----------------------------------------------------------------------
157 // Compute all the lcp values in the full range.
158 //----------------------------------------------------------------------
162 if (pivots == pivots_end) return;
163 difftype centerpoint = (pivots_end - pivots)/2;
164 //std::cout << "midpoint of [0," << (pivots_end - pivots) << ")="
165 // << centerpoint << "\n";
166 lcps[centerpoint] = 0;
168 // unbounded ranges on the left
169 difftype rightpoint = centerpoint;
170 while (rightpoint > 0) {
171 difftype midpoint = rightpoint/2;
172 //std::cout << "midpoint of [0," << rightpoint << ")=" << midpoint << "\n";
173 lcptype llcp = suffix_lcp(str+pivots[midpoint],
174 str+pivots[rightpoint],
175 str_end, static_cast<lcptype>(0), maxlcp);
176 lcps[midpoint] = llcp;
177 //std::cout << llcp << std::endl;
178 compute_lcps_bounded(pivots+midpoint+1, pivots+rightpoint,
179 lcps.begin()+midpoint+1, llcp, maxlcp);
180 rightpoint = midpoint;
183 // unbounded ranges on the right
184 SuffixIterator left_suf = pivots + centerpoint + 1;
185 lcpiterator left_lcp = lcps.begin() + centerpoint + 1;
186 while (left_suf != pivots_end) {
187 difftype midpoint = (pivots_end - left_suf)/2;
188 //std::cout << "midpoint of [" << (left_suf-pivots) << "," <<
189 // (pivots_end-pivots) << ")=" << (left_suf-pivots)+midpoint << "\n";
190 lcptype llcp = suffix_lcp(str+left_suf[-1], str+left_suf[midpoint],
191 str_end, static_cast<lcptype>(0), maxlcp);
192 left_lcp[midpoint] = -llcp;
193 //std::cout << -llcp << std::endl;
194 compute_lcps_bounded(left_suf, left_suf+midpoint, left_lcp,
196 left_suf += midpoint + 1;
197 left_lcp += midpoint + 1;
206 //----------------------------------------------------------------------
208 //----------------------------------------------------------------------
211 binary_search(StringIterator query) const
213 //SuffixIterator left = pivots;
214 //SuffixIterator right = pivots_end;
215 //SuffixIterator mid;
218 difftype right = pivots_end-pivots;
222 lcptype lcp_diff = 0;
223 lcptype lcp_left, lcp_right;
228 std::cout << "middle minlcp=" << int(minlcp)
229 << " lcp_diff=" << int(lcp_diff)
230 << " mid=" << left + (right-left)/2
233 if (left == right) return left;
234 mid = left + (right-left)/2;
235 if (str+pivots[mid] == query) return mid+1;
236 lcptype lcp_mid = lcps[mid];
238 lcptype newlcp = suffix_lcp(query, str+pivots[mid], str_end,
240 if (newlcp == maxlcp) {
241 lcp_left = lcp_right = minlcp;
244 if (query+newlcp == str_end ||
245 query[newlcp] < str[pivots[mid]+newlcp]) {
247 lcp_diff = newlcp - minlcp;
248 if (lcp_diff != 0) goto closer_to_right;
251 lcp_diff = minlcp - newlcp;
252 if (lcp_diff != 0) goto closer_to_left;
255 if (lcp_mid < lcp_diff) {
266 std::cout << "left minlcp=" << int(minlcp)
267 << " lcp_diff=" << int(lcp_diff)
268 << " mid=" << left + (right-left)/2
271 if (left == right) return left;
272 mid = left + (right-left)/2;
273 if (str+pivots[mid] == query) return mid+1;
274 lcptype lcp_mid = lcps[mid];
275 if (lcp_mid > lcp_diff) {
282 if (lcp_mid == lcp_diff) {
283 lcptype lcp = minlcp - lcp_diff;
284 lcptype newlcp = suffix_lcp(query, str+pivots[mid], str_end,
286 if (newlcp == maxlcp) {
291 if (query+newlcp == str_end ||
292 query[newlcp] < str[pivots[mid]+newlcp]) {
295 lcp_diff = newlcp - minlcp;
296 if (lcp_diff == 0) goto in_the_middle;
297 else goto closer_to_right;
300 lcp_diff = minlcp - newlcp;
311 std::cout << "right minlcp=" << int(minlcp)
312 << " lcp_diff=" << int(lcp_diff)
313 << " mid=" << left + (right-left)/2
316 if (left == right) return left;
317 mid = left + (right-left)/2;
318 if (str+pivots[mid] == query) return mid+1;
319 lcptype lcp_mid = lcps[mid];
320 if (lcp_mid < lcp_diff) {
327 if (lcp_mid == lcp_diff) {
328 lcptype lcp = minlcp + lcp_diff;
329 lcptype newlcp = suffix_lcp(query, str+pivots[mid], str_end,
331 if (newlcp == maxlcp) {
336 if (query+newlcp == str_end ||
337 query[newlcp] < str[pivots[mid]+newlcp]) {
339 lcp_diff = newlcp - minlcp;
343 lcp_diff = minlcp - newlcp;
344 if (lcp_diff == 0) goto in_the_middle;
345 else goto closer_to_left;
353 // find the range of fully matching pivots
354 //std::cout << "fullmatch: left\n";
356 difftype right_tmp = mid;
357 while (left != right_tmp) {
358 difftype mid = left + (right_tmp-left)/2;
359 //std::cout << left << mid << right_tmp << std::endl;
360 difftype lcp_diff = lcps[mid];
361 //std::cout << " " << lcp_left << " " << lcp_diff << std::endl;
362 if (lcp_left + lcp_diff == maxlcp) {
365 if (lcp_diff > 0) { lcp_left += lcp_diff; }
370 //std::cout << "fullmatch: right\n";
372 difftype left_tmp = mid+1;
373 while (left_tmp != right) {
374 difftype mid = left_tmp + (right-left_tmp)/2;
375 //std::cout << left_tmp << mid << right << std::endl;
376 difftype lcp_diff = lcps[mid];
377 //std::cout << " " << lcp_right << " " << lcp_diff << std::endl;
378 if (lcp_right - lcp_diff == maxlcp) {
381 if (lcp_diff < 0) { lcp_right -= lcp_diff; }
386 //std::cout << "fullmatch: dcsearch\n";
387 while (left != right) {
388 difftype mid = left + (right-left)/2;
389 //std::cout << left << mid << right << std::endl;
390 if (dc_less_or_equal(pivots[mid], query-str)) {
405 #endif // DISTRIBUTE_HPP