1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
20 #include "TCImplementation.h"
22 //#define DEBUG_MEMUSAGE
24 #include "HeapProfiler.h" // FIXME remove
34 #include <cstring> // For strlen()
43 // Save file version info
44 const uchar TCImplementation::versionFlag = 6;
47 * Constructor inits an empty dynamic FM-index.
48 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
50 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
51 : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
52 suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
54 makewavelet(bwt); // Deletes bwt!
57 // Make sampling tables
61 bool TCImplementation::EmptyText(DocId k) const
63 assert(k < (DocId)numberOfTexts);
64 return false; // Empty texts are not indexed
67 uchar * TCImplementation::GetText(DocId k) const
69 assert(k < (DocId)numberOfTexts);
71 return textStorage->GetText(k);
72 /* TextPosition i = k;
75 // Reserve average string length to avoid reallocs
76 result.reserve(n/numberOfTexts);
78 uchar c = alphabetrank->access(i);
82 i = C[c]+alphabetrank->rank(c,i)-1;
84 c = alphabetrank->access(i); // "next" char.
87 // Convert to uchar (FIXME return string?)
89 uchar* res = new uchar[i+1];
91 for (ulong j = 0; j < i; ++j)
92 res[i-j-1] = result[j];
97 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
99 assert(k < (DocId)numberOfTexts);
100 assert(j < (*textLength)[k]);
105 // Start position of k'th text
106 ulong start = (*textStartPos)[k];
108 return Substring(i + start, j-i+1);
111 /******************************************************************
112 * Existential queries
114 bool TCImplementation::IsPrefix(uchar const * pattern) const
116 TextPosition m = strlen((char *)pattern);
120 TextPosition sp = 0, ep = 0;
121 Search(pattern, m, &sp, &ep);
123 // Check for end-marker(s) in result interval
124 if (CountEndmarkers(sp, ep))
129 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
131 TextPosition m = strlen((char *)pattern);
135 TextPosition sp = 0, ep = 0;
136 Search(pattern, m, &sp, &ep);
138 // Check for end-marker(s) in result interval
139 if (CountEndmarkers(sp, ep, begin, end))
145 bool TCImplementation::IsSuffix(uchar const *pattern) const
147 // Here counting is as fast as IsSuffix():
148 if (CountSuffix(pattern) > 0)
153 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
155 // Here counting is as fast as IsSuffix():
156 if (CountSuffix(pattern, begin, end) > 0)
161 bool TCImplementation::IsEqual(uchar const *pattern) const
163 TextPosition m = std::strlen((char *)pattern);
165 return false; // No empty texts exists
167 TextPosition sp = 0, ep = 0;
168 // Match including end-marker
169 Search(pattern, m+1, &sp, &ep);
171 // Check for end-marker(s) in result interval
172 if (CountEndmarkers(sp, ep))
177 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
179 TextPosition m = std::strlen((char *)pattern);
181 return false; // No empty texts exists
183 TextPosition sp = 0, ep = 0;
184 // Match including end-marker
185 Search(pattern, m+1, &sp, &ep, begin, end);
187 // Check for end-marker(s) in result interval
188 if (CountEndmarkers(sp, ep))
193 bool TCImplementation::IsContains(uchar const * pattern) const
195 TextPosition m = strlen((char *)pattern);
199 TextPosition sp = 0, ep = 0;
200 // Just check if pattern exists somewhere
201 ulong count = Search(pattern, m, &sp, &ep);
208 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
210 // Here counting is as fast as existential querying
211 if (CountContains(pattern, begin, end) > 0)
212 return true; // FIXME No need to filter result set
216 bool TCImplementation::IsLessThan(uchar const * pattern) const
218 if (CountLessThan(pattern) > 0)
223 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
225 if (CountLessThan(pattern, begin, end) > 0)
230 /******************************************************************
233 ulong TCImplementation::Count(uchar const * pattern) const
235 TextPosition m = strlen((char *)pattern);
239 TextPosition sp = 0, ep = 0;
240 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
244 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
246 TextPosition m = strlen((char *)pattern);
248 return numberOfTexts;
250 TextPosition sp = 0, ep = 0;
251 Search(pattern, m, &sp, &ep);
253 // Count end-markers in result interval
254 return CountEndmarkers(sp, ep);
257 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
259 TextPosition m = strlen((char *)pattern);
261 return numberOfTexts;
263 TextPosition sp = 0, ep = 0;
264 Search(pattern, m, &sp, &ep);
266 // Count end-markers in result interval
267 return CountEndmarkers(sp, ep, begin, end);
270 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
272 TextPosition m = strlen((char *)pattern);
274 return numberOfTexts;
276 TextPosition sp = 0, ep = 0;
277 // Search with end-marker
278 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
283 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
285 TextPosition m = strlen((char *)pattern);
287 return numberOfTexts;
289 TextPosition sp = 0, ep = 0;
290 // Search with end-marker
291 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
296 unsigned TCImplementation::CountEqual(uchar const *pattern) const
298 TextPosition m = strlen((char const *)pattern);
300 return 0; // No empty texts.
302 TextPosition sp = 0, ep = 0;
303 // Match including end-marker
304 Search(pattern, m+1, &sp, &ep);
306 // Count end-markers in result interval
307 return CountEndmarkers(sp, ep);
310 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
312 TextPosition m = strlen((char const *)pattern);
314 return 0; // No empty texts.
316 TextPosition sp = 0, ep = 0;
317 // Match including end-marker
318 Search(pattern, m+1, &sp, &ep, begin, end);
320 // Count end-markers in result interval
321 return CountEndmarkers(sp, ep); // Already within [begin, end]
324 unsigned TCImplementation::CountContains(uchar const * pattern) const
326 TextPosition m = strlen((char const *)pattern);
328 return numberOfTexts; // Total number of texts.
330 // Here counting is as slow as fetching the result set
331 // because we have to filter out occ's that fall within same document.
332 TextCollection::document_result result = Contains(pattern);
333 return result.size();
336 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
338 TextPosition m = strlen((char const *)pattern);
340 return numberOfTexts; // Total number of texts.
342 // Here counting is as slow as fetching the result set
343 // because we have to filter out occ's that fall within same document.
344 TextCollection::document_result result = Contains(pattern, begin, end);
345 return result.size();
348 // Less than or equal
349 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
351 TextPosition m = strlen((char const *)pattern);
353 return 0; // No empty texts.
355 TextPosition sp = 0, ep = 0;
356 SearchLessThan(pattern, m, &sp, &ep);
358 // Count end-markers in result interval
359 return CountEndmarkers(sp, ep);
362 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
364 TextPosition m = strlen((char const *)pattern);
366 return 0; // No empty texts.
368 TextPosition sp = 0, ep = 0;
369 SearchLessThan(pattern, m, &sp, &ep);
371 // Count end-markers in result interval
372 return CountEndmarkers(sp, ep, begin, end);
376 * Document reporting queries
378 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
380 TextPosition m = strlen((char *)pattern);
382 return TextCollection::document_result(); // FIXME Should return all 1...k
384 TextPosition sp = 0, ep = 0;
385 Search(pattern, m, &sp, &ep);
387 // Iterate through end-markers in [sp,ep]:
388 return EnumerateEndmarkers(sp, ep);
391 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
393 TextPosition m = strlen((char *)pattern);
395 return TextCollection::document_result(); // FIXME Should return all 1...k
397 TextPosition sp = 0, ep = 0;
398 Search(pattern, m, &sp, &ep);
400 // Return end-markers in [sp,ep] and [begin, end]:
401 return EnumerateEndmarkers(sp, ep, begin, end);
404 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
406 TextPosition m = strlen((char *)pattern);
408 return TextCollection::document_result(); // FIXME Should return all 1...k
410 TextPosition sp = 0, ep = 0;
411 // Search with end-marker
412 Search(pattern, m+1, &sp, &ep);
414 TextCollection::document_result result;
415 result.reserve(ep-sp+1); // Try to avoid reallocation.
417 // Check each occurrence
418 for (; sp <= ep; ++sp)
422 uchar c = alphabetrank->access(i);
423 while (c != '\0' && !sampled->access(i))
425 i = C[c]+alphabetrank->rank(c,i)-1;
426 c = alphabetrank->access(i);
428 // Assert: c == '\0' OR sampled->IsBitSet(i)
432 // Rank among the end-markers in BWT
433 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
434 result.push_back(Doc->access(endmarkerRank));
436 else // Sampled position
438 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
439 result.push_back(docId);
446 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
448 TextPosition m = strlen((char *)pattern);
450 return TextCollection::document_result(); // FIXME Should return all 1...k
452 TextPosition sp = 0, ep = 0;
453 // Search with end-marker
454 Search(pattern, m+1, &sp, &ep, begin, end);
456 TextCollection::document_result result;
457 result.reserve(ep-sp+1); // Try to avoid reallocation.
459 // Check each occurrence, already within [begin, end]
460 for (; sp <= ep; ++sp)
464 uchar c = alphabetrank->access(i);
465 while (c != '\0' && !sampled->access(i))
467 i = C[c]+alphabetrank->rank(c,i)-1;
468 c = alphabetrank->access(i);
470 // Assert: c == '\0' OR sampled->IsBitSet(i)
474 // Rank among the end-markers in BWT
475 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
476 result.push_back(Doc->access(endmarkerRank));
478 else // Sampled position
480 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
481 result.push_back(docId);
489 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
491 TextPosition m = strlen((char const *)pattern);
493 return TextCollection::document_result(); // FIXME Should return all empty texts
495 TextPosition sp = 0, ep = 0;
496 // Match including end-marker
497 Search(pattern, m+1, &sp, &ep);
499 // Report end-markers in result interval
500 return EnumerateEndmarkers(sp, ep);
503 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
505 TextPosition m = strlen((char const *)pattern);
507 return TextCollection::document_result(); // FIXME Should return all empty texts
509 TextPosition sp = 0, ep = 0;
510 // Match including end-marker
511 Search(pattern, m+1, &sp, &ep, begin, end);
513 // Report end-markers in result interval
514 return EnumerateEndmarkers(sp, ep, begin, end);
518 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
520 TextPosition m = strlen((char *)pattern);
522 return TextCollection::document_result();
524 TextPosition sp = 0, ep = 0;
525 // Search all occurrences
526 Search(pattern, m, &sp, &ep);
528 // We want unique document indentifiers, using std::set to collect them
529 std::set<DocId> resultSet;
530 EnumerateDocuments(resultSet, sp, ep);
532 // Convert std::set to std::vector
533 TextCollection::document_result result(resultSet.begin(), resultSet.end());
537 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
539 TextPosition m = strlen((char *)pattern);
541 return TextCollection::document_result();
543 TextPosition sp = 0, ep = 0;
544 // Search all occurrences
545 Search(pattern, m, &sp, &ep);
547 // We want unique document indentifiers, using std::set to collect them
548 std::set<DocId> resultSet;
549 EnumerateDocuments(resultSet, sp, ep, begin, end);
551 // Convert std::set to std::vector
552 TextCollection::document_result result(resultSet.begin(), resultSet.end());
556 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
558 TextPosition m = strlen((char *)pattern);
560 return TextCollection::document_result(); // empty result set
562 TextPosition sp = 0, ep = 0;
563 SearchLessThan(pattern, m, &sp, &ep);
565 // Report end-markers in result interval
566 return EnumerateEndmarkers(sp, ep);
569 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
571 TextPosition m = strlen((char *)pattern);
573 return TextCollection::document_result(); // empty result set
575 TextPosition sp = 0, ep = 0;
576 SearchLessThan(pattern, m, &sp, &ep);
578 // Iterate through end-markers in [sp,ep] and [begin, end]:
579 return EnumerateEndmarkers(sp, ep, begin, end);
583 TextCollection::document_result TCImplementation::Kmismaches(uchar const * pattern, unsigned k) const
585 TextPosition m = strlen((char *)pattern);
587 return TextCollection::document_result(); // empty result set
589 suffix_range_vector ranges;
590 kmismatches(ranges, pattern, 0, n-1, m, k);
591 std::set<DocId> resultSet;
593 for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
594 // Iterate through docs in [sp,ep]:
595 EnumerateDocuments(resultSet, (*it).first, (*it).second);
597 // Convert std::set to std::vector
598 TextCollection::document_result result(resultSet.begin(), resultSet.end());
602 TextCollection::document_result TCImplementation::Kerrors(uchar const * pattern, unsigned k) const
604 TextPosition m = strlen((char *)pattern);
606 return TextCollection::document_result(); // empty result set
608 suffix_range_vector ranges;
609 ulong *dd = new ulong[m+1];
610 for (ulong i=0;i<m+1;i++)
612 kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
615 std::set<DocId> resultSet;
616 for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
617 // Iterate through docs in [sp,ep]:
618 EnumerateDocuments(resultSet, (*it).first, (*it).second);
620 // Convert std::set to std::vector
621 TextCollection::document_result result(resultSet.begin(), resultSet.end());
627 * Full result set queries
629 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
631 TextPosition m = strlen((char *)pattern);
633 return full_result(); // FIXME Throw exception?
635 TextPosition sp = 0, ep = 0;
636 // Search all occurrences
637 Search(pattern, m, &sp, &ep);
640 result.reserve(ep-sp+1); // Try to avoid reallocation.
641 EnumeratePositions(result, sp, ep);
646 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
648 TextPosition m = strlen((char *)pattern);
650 return full_result(); // FIXME Throw exception?
652 TextPosition sp = 0, ep = 0;
653 // Search all occurrences
654 Search(pattern, m, &sp, &ep);
657 result.reserve(ep-sp+1); // Try to avoid reallocation.
658 EnumeratePositions(result, sp, ep, begin, end);
663 TextCollection::full_result TCImplementation::FullKmismatches(uchar const * pattern, unsigned k) const
665 TextPosition m = strlen((char *)pattern);
667 return TextCollection::full_result(); // empty result set
669 suffix_range_vector ranges;
670 ulong count = kmismatches(ranges, pattern, 0, n-1, m, k);
672 TextCollection::full_result result;
673 result.reserve(count); // avoid reallocation.
674 for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
675 // Iterate through docs in [sp,ep]:
676 EnumeratePositions(result, (*it).first, (*it).second);
680 TextCollection::full_result TCImplementation::FullKerrors(uchar const * pattern, unsigned k) const
682 TextPosition m = strlen((char *)pattern);
684 return TextCollection::full_result(); // empty result set
686 suffix_range_vector ranges;
687 ulong *dd = new ulong[m+1];
688 for (unsigned i=0;i<m+1;i++)
690 ulong count = kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
693 TextCollection::full_result result;
694 result.reserve(count); // avoid reallocation.
695 for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
696 // Iterate through docs in [sp,ep]:
697 EnumeratePositions(result, (*it).first, (*it).second);
703 * Save index to a file handle
705 * Throws a std::runtime_error exception on i/o error.
706 * First byte that is saved represents the version number of the save file.
707 * In version 2 files, the data fields are:
712 TextPosition bwtEndPos;
713 static_sequence * alphabetrank;
715 BlockArray *suffixes;
716 BlockArray *suffixDocId;
717 unsigned numberOfTexts;
719 static_sequence *docId;
721 void TCImplementation::Save(FILE *file) const
723 // Saving version info:
724 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
725 throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
727 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
728 throw std::runtime_error("TCImplementation::Save(): file write error (n).");
729 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
730 throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
732 for(ulong i = 0; i < 256; ++i)
733 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
734 throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
736 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
737 throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
739 alphabetrank->save(file);
741 suffixes->Save(file);
742 suffixDocId->Save(file);
744 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
745 throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
746 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
747 throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
750 textStorage->Save(file);
756 * Load index from a file handle
758 * Throws a std::runtime_error exception on i/o error.
759 * For more info, see TCImplementation::Save().
761 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
762 : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
763 suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
766 if (std::fread(&verFlag, 1, 1, file) != 1)
767 throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
768 if (verFlag != TCImplementation::versionFlag)
769 throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
771 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
772 throw std::runtime_error("TCImplementation::Load(): file read error (n).");
773 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
774 throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
775 // FIXME samplerate can not be changed during load.
776 // if (this->samplerate == 0)
777 // this->samplerate = samplerate;
779 for(ulong i = 0; i < 256; ++i)
780 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
781 throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
783 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
784 throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
786 alphabetrank = static_sequence::load(file);
787 sampled = static_bitsequence::load(file);
788 suffixes = new BlockArray(file);
789 suffixDocId = new BlockArray(file);
791 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
792 throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
793 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
794 throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
796 Doc = static_sequence::load(file);
797 textStorage = new TextStorage(file);
799 // FIXME Construct data structures with new samplerate
806 * Rest of the functions follow...
808 ulong TCImplementation::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const
811 while (*sp<=*ep && i>=1)
813 c = (int)pattern[--i];
814 *sp = C[c]+alphabetrank->rank(c,*sp-1);
815 *ep = C[c]+alphabetrank->rank(c,*ep)-1;
818 return *ep - *sp + 1;
824 ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const
829 result.push_back(std::make_pair(sp,ep));
839 sum = searchPrefix(pattern, j, &sp, &ep);
841 result.push_back(std::make_pair(sp, ep));
844 vector<int> chars = alphabetrank->accessAll(sp, ep);
845 for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
848 continue; // skip '\0'
850 spnew = C[c]+alphabetrank->rank(c,sp-1);
851 epnew = C[c]+alphabetrank->rank(c,ep)-1;
852 if (c!=pattern[j-1]) knew = (int)k-1; else knew = k;
853 if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew);
858 //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i
859 ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const
861 cout << "j = " << j << ", k = " << k << ", d:";
862 for (unsigned i = 0; i < m+1; ++i)
866 if (d[m]<=k) // range of suffixes with at most k-errors found
869 result.push_back(std::make_pair(sp, ep));
870 return (sp<=ep)?ep-sp+1:0ul;
874 ulong *dnew = new ulong[m+1];
880 vector<int> chars = alphabetrank->accessAll(sp, ep);
881 for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
884 continue; // skip '\0'
886 spnew = C[c]+alphabetrank->rank(c,sp-1);
887 epnew = C[c]+alphabetrank->rank(c,ep)-1;
888 if (spnew>epnew) continue;
891 for (p=1; p<=m; p++) {
892 dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1));
893 if (dnew[p]<lowerbound)
894 lowerbound = dnew[p];
897 sum += kerrors(result, pattern, spnew, epnew, j-1, k,dnew,m);
904 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
906 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
907 int c = (int)pattern[m-1];
909 TextPosition sp = C[c];
910 TextPosition ep = C[c+1]-1;
911 while (sp<=ep && i>=1)
913 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
914 c = (int)pattern[--i];
915 sp = C[c]+alphabetrank->rank(c,sp-1);
916 ep = C[c]+alphabetrank->rank(c,ep)-1;
926 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
928 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
929 int c = (int)pattern[m-1];
930 assert(c == 0); // Start from endmarkers
932 TextPosition sp = begin;
933 TextPosition ep = end;
934 while (sp<=ep && i>=1)
936 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
937 c = (int)pattern[--i];
938 sp = C[c]+alphabetrank->rank(c,sp-1);
939 ep = C[c]+alphabetrank->rank(c,ep)-1;
950 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
952 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
953 uint c = (int)pattern[m-1];
956 TextPosition ep = C[c+1]-1;
957 while (sp<=ep && i>=1)
959 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
960 c = (int)pattern[--i];
961 uint result = alphabetrank->rankLessThan(c,ep);
976 TCImplementation::~TCImplementation() {
985 void TCImplementation::makewavelet(uchar *bwt)
994 if (C[i]>0) {min = i; break;}
995 for (i=255;i>=min;--i)
996 if (C[i]>0) {max = i; break;}
998 ulong prev=C[0], temp;
1000 for (i=1;i<256;i++) {
1005 // this->codetable = node::makecodetable(bwt,n);
1006 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1008 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1009 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1011 #ifdef DEBUG_MEMUSAGE
1012 std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1013 HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
1016 alphabet_mapper * am = new alphabet_mapper_none();
1017 static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1018 wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1019 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1021 bwt = 0; // already deleted
1023 #ifdef DEBUG_MEMUSAGE
1024 std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1025 std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1029 void TCImplementation::maketables()
1031 // Calculate BWT end-marker position (of last inserted text)
1034 uint alphabetrank_i_tmp = 0;
1035 uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
1038 i = C[c]+alphabetrank_i_tmp-1;
1039 c = alphabetrank->access(i, alphabetrank_i_tmp);
1042 this->bwtEndPos = i;
1045 // Build up array for text starting positions
1046 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1047 (*textStartPos)[0] = 0;
1049 // Mapping from end-markers to doc ID's:
1050 uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs.
1052 uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
1053 for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
1054 sampledpositions[i] = 0;
1056 ulong x,p=bwtEndPos;
1057 ulong sampleCount = 0;
1058 // Keeping track of text position of prev. end-marker seen
1059 ulong posOfSuccEndmarker = n;
1060 DocId textId = numberOfTexts;
1063 uint alphabetrank_i_tmp =0;
1066 * First pass: populate tables textStartPos and sampledpositions.
1068 for (ulong i=n-1;i<ulongmax;i--) {
1069 // i substitutes SA->GetPos(i)
1072 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1073 set_field(sampledpositions,1,p,1);
1077 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1082 // Record the order of end-markers in BWT:
1083 ulong endmarkerRank = alphabetrank_i_tmp - 1;
1084 endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts;
1086 // Store text length and text start position:
1087 if (textId < (DocId)numberOfTexts - 1)
1089 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1090 posOfSuccEndmarker = x;
1093 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1094 p = textId; // Correct LF-mapping to the last char of the previous text.
1096 else // Now c != '\0', do LF-mapping:
1097 p = C[c]+alphabetrank_i_tmp-1;
1099 assert(textId == 0);
1101 sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1102 delete [] sampledpositions;
1103 ulong sampleLength = sampled->rank1(n-1);
1104 assert(sampleCount == sampleLength);
1106 // Suffixes store an offset from the text start position
1107 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1108 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1111 textId = numberOfTexts;
1113 TextStorageBuilder tsbuilder(n);
1116 * Second pass: populate tables suffixes and suffixDocId.
1118 for (ulong i=n-1;i<ulongmax;i--) {
1121 if (sampled->access(p)) {
1122 ulong j = sampled->rank1(p)-1;
1124 (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x);
1126 // calculate offset from text start:
1127 (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]];
1130 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1136 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1137 p = textId; // Correct LF-mapping to the last char of the previous text.
1139 else // Now c != '\0', do LF-mapping:
1140 p = C[c]+alphabetrank_i_tmp-1;
1142 assert(textId == 0);
1143 delete textStartPos;
1145 textStorage = tsbuilder.InitTextStorage();
1147 #ifdef DEBUG_MEMUSAGE
1148 std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1149 HeapProfiler::ResetMaxHeapConsumption();
1152 alphabet_mapper * am = new alphabet_mapper_none();
1153 static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1154 Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true);
1156 // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1158 #ifdef DEBUG_MEMUSAGE
1159 std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1165 * Finds document identifier for given text position
1167 * Starting text position of the document is stored into second parameter.
1168 * Binary searching on text starting positions.
1170 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1175 DocId b = numberOfTexts - 1;
1178 DocId c = a + (b - a)/2;
1179 if ((*textStartPos)[c] > i)
1181 else if ((*textStartPos)[c+1] > i)
1187 assert(a < (DocId)numberOfTexts);
1188 assert(i >= (*textStartPos)[a]);
1189 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));