1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
20 #include "TCImplementation.h"
22 //#define DEBUG_MEMUSAGE
24 #include "HeapProfiler.h" // FIXME remove
34 #include <cstring> // For strlen()
43 // Save file version info
44 const uchar TCImplementation::versionFlag = 8;
47 * Constructor inits an empty dynamic FM-index.
48 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
50 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_,
51 unsigned numberOfTexts_, ulong maxTextLength_, ulong numberOfSamples_,
52 CSA::DeltaVector & notIndexed, const string & niText, char tsType)
53 : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
54 suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
56 makewavelet(bwt); // Deletes bwt!
59 // Make sampling tables
60 maketables(numberOfSamples_, tsType, notIndexed, niText);
63 bool TCImplementation::EmptyText(DocId k) const
65 assert(k < (DocId)numberOfTexts);
66 return false; // Empty texts are not indexed
69 uchar * TCImplementation::GetText(DocId k) const
71 assert(k < (DocId)numberOfTexts);
73 return textStorage->GetText(k);
74 /* TextPosition i = k;
77 // Reserve average string length to avoid reallocs
78 result.reserve(n/numberOfTexts);
80 uchar c = alphabetrank->access(i);
84 i = C[c]+alphabetrank->rank(c,i)-1;
86 c = alphabetrank->access(i); // "next" char.
89 // Convert to uchar (FIXME return string?)
91 uchar* res = new uchar[i+1];
93 for (ulong j = 0; j < i; ++j)
94 res[i-j-1] = result[j];
99 * Substring queries are supported via the pointer returned by TextStorage::GetText
100 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
102 assert(k < (DocId)numberOfTexts);
103 assert(j < (*textLength)[k]);
108 // Start position of k'th text
109 ulong start = (*textStartPos)[k];
111 return Substring(i + start, j-i+1);
116 /******************************************************************
117 * Existential queries
119 bool TCImplementation::IsPrefix(uchar const * pattern) const
121 TextPosition m = strlen((char *)pattern);
125 TextPosition sp = 0, ep = 0;
126 Search(pattern, m, &sp, &ep);
128 // Check for end-marker(s) in result interval
129 if (CountEndmarkers(sp, ep))
134 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
136 TextPosition m = strlen((char *)pattern);
140 TextPosition sp = 0, ep = 0;
141 Search(pattern, m, &sp, &ep);
143 // Check for end-marker(s) in result interval
144 if (CountEndmarkers(sp, ep, begin, end))
150 bool TCImplementation::IsSuffix(uchar const *pattern) const
152 // Here counting is as fast as IsSuffix():
153 if (CountSuffix(pattern) > 0)
158 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
160 // Here counting is as fast as IsSuffix():
161 if (CountSuffix(pattern, begin, end) > 0)
166 bool TCImplementation::IsEqual(uchar const *pattern) const
168 TextPosition m = std::strlen((char *)pattern);
170 return false; // No empty texts exists
172 TextPosition sp = 0, ep = 0;
173 // Match including end-marker
174 Search(pattern, m+1, &sp, &ep);
176 // Check for end-marker(s) in result interval
177 if (CountEndmarkers(sp, ep))
182 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
184 TextPosition m = std::strlen((char *)pattern);
186 return false; // No empty texts exists
188 TextPosition sp = 0, ep = 0;
189 // Match including end-marker
190 Search(pattern, m+1, &sp, &ep, begin, end);
192 // Check for end-marker(s) in result interval
193 if (CountEndmarkers(sp, ep))
198 bool TCImplementation::IsContains(uchar const * pattern) const
200 TextPosition m = strlen((char *)pattern);
204 TextPosition sp = 0, ep = 0;
205 // Just check if pattern exists somewhere
206 ulong count = Search(pattern, m, &sp, &ep);
213 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
215 // Here counting is as fast as existential querying
216 if (CountContains(pattern, begin, end) > 0)
217 return true; // FIXME No need to filter result set
221 bool TCImplementation::IsLessThan(uchar const * pattern) const
223 if (CountLessThan(pattern) > 0)
228 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
230 if (CountLessThan(pattern, begin, end) > 0)
235 /******************************************************************
238 ulong TCImplementation::Count(uchar const * pattern) const
240 TextPosition m = strlen((char *)pattern);
244 TextPosition sp = 0, ep = 0;
245 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
249 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
251 TextPosition m = strlen((char *)pattern);
253 return numberOfTexts;
255 TextPosition sp = 0, ep = 0;
256 Search(pattern, m, &sp, &ep);
258 // Count end-markers in result interval
259 return CountEndmarkers(sp, ep);
262 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
264 TextPosition m = strlen((char *)pattern);
266 return numberOfTexts;
268 TextPosition sp = 0, ep = 0;
269 Search(pattern, m, &sp, &ep);
271 // Count end-markers in result interval
272 return CountEndmarkers(sp, ep, begin, end);
275 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
277 TextPosition m = strlen((char *)pattern);
279 return numberOfTexts;
281 TextPosition sp = 0, ep = 0;
282 // Search with end-marker
283 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
288 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
290 TextPosition m = strlen((char *)pattern);
292 return numberOfTexts;
294 TextPosition sp = 0, ep = 0;
295 // Search with end-marker
296 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
301 unsigned TCImplementation::CountEqual(uchar const *pattern) const
303 TextPosition m = strlen((char const *)pattern);
305 return 0; // No empty texts.
307 TextPosition sp = 0, ep = 0;
308 // Match including end-marker
309 Search(pattern, m+1, &sp, &ep);
311 // Count end-markers in result interval
312 return CountEndmarkers(sp, ep);
315 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
317 TextPosition m = strlen((char const *)pattern);
319 return 0; // No empty texts.
321 TextPosition sp = 0, ep = 0;
322 // Match including end-marker
323 Search(pattern, m+1, &sp, &ep, begin, end);
325 // Count end-markers in result interval
326 return CountEndmarkers(sp, ep); // Already within [begin, end]
329 unsigned TCImplementation::CountContains(uchar const * pattern) const
331 TextPosition m = strlen((char const *)pattern);
333 return numberOfTexts; // Total number of texts.
335 // Here counting is as slow as fetching the result set
336 // because we have to filter out occ's that fall within same document.
337 TextCollection::document_result result = Contains(pattern);
338 return result.size();
341 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
343 TextPosition m = strlen((char const *)pattern);
345 return numberOfTexts; // Total number of texts.
347 // Here counting is as slow as fetching the result set
348 // because we have to filter out occ's that fall within same document.
349 TextCollection::document_result result = Contains(pattern, begin, end);
350 return result.size();
353 // Less than or equal
354 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
356 TextPosition m = strlen((char const *)pattern);
358 return 0; // No empty texts.
360 TextPosition sp = 0, ep = 0;
361 SearchLessThan(pattern, m, &sp, &ep);
363 // Count end-markers in result interval
364 return CountEndmarkers(sp, ep);
367 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
369 TextPosition m = strlen((char const *)pattern);
371 return 0; // No empty texts.
373 TextPosition sp = 0, ep = 0;
374 SearchLessThan(pattern, m, &sp, &ep);
376 // Count end-markers in result interval
377 return CountEndmarkers(sp, ep, begin, end);
381 * Document reporting queries
383 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
385 TextPosition m = strlen((char *)pattern);
387 return TextCollection::document_result(); // FIXME Should return all 1...k
389 TextPosition sp = 0, ep = 0;
390 Search(pattern, m, &sp, &ep);
392 // Iterate through end-markers in [sp,ep]:
393 return EnumerateEndmarkers(sp, ep);
396 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
398 TextPosition m = strlen((char *)pattern);
400 return TextCollection::document_result(); // FIXME Should return all 1...k
402 TextPosition sp = 0, ep = 0;
403 Search(pattern, m, &sp, &ep);
405 // Return end-markers in [sp,ep] and [begin, end]:
406 return EnumerateEndmarkers(sp, ep, begin, end);
409 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
411 TextPosition m = strlen((char *)pattern);
413 return TextCollection::document_result(); // FIXME Should return all 1...k
415 TextPosition sp = 0, ep = 0;
416 // Search with end-marker
417 Search(pattern, m+1, &sp, &ep);
419 TextCollection::document_result result;
420 result.reserve(ep-sp+1); // Try to avoid reallocation.
422 // Check each occurrence
423 for (; sp <= ep; ++sp)
427 uchar c = alphabetrank->access(i);
428 while (c != '\0' && !sampled->access(i))
430 i = C[c]+alphabetrank->rank(c,i)-1;
431 c = alphabetrank->access(i);
433 // Assert: c == '\0' OR sampled->IsBitSet(i)
437 // Rank among the end-markers in BWT
438 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
439 result.push_back(Doc->access(endmarkerRank));
441 else // Sampled position
443 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
444 result.push_back(docId);
451 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
453 TextPosition m = strlen((char *)pattern);
455 return TextCollection::document_result(); // FIXME Should return all 1...k
457 TextPosition sp = 0, ep = 0;
458 // Search with end-marker
459 Search(pattern, m+1, &sp, &ep, begin, end);
461 TextCollection::document_result result;
462 result.reserve(ep-sp+1); // Try to avoid reallocation.
464 // Check each occurrence, already within [begin, end]
465 for (; sp <= ep; ++sp)
469 uchar c = alphabetrank->access(i);
470 while (c != '\0' && !sampled->access(i))
472 i = C[c]+alphabetrank->rank(c,i)-1;
473 c = alphabetrank->access(i);
475 // Assert: c == '\0' OR sampled->IsBitSet(i)
479 // Rank among the end-markers in BWT
480 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
481 result.push_back(Doc->access(endmarkerRank));
483 else // Sampled position
485 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
486 result.push_back(docId);
494 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
496 TextPosition m = strlen((char const *)pattern);
498 return TextCollection::document_result(); // FIXME Should return all empty texts
500 TextPosition sp = 0, ep = 0;
501 // Match including end-marker
502 Search(pattern, m+1, &sp, &ep);
504 // Report end-markers in result interval
505 return EnumerateEndmarkers(sp, ep);
508 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
510 TextPosition m = strlen((char const *)pattern);
512 return TextCollection::document_result(); // FIXME Should return all empty texts
514 TextPosition sp = 0, ep = 0;
515 // Match including end-marker
516 Search(pattern, m+1, &sp, &ep, begin, end);
518 // Report end-markers in result interval
519 return EnumerateEndmarkers(sp, ep, begin, end);
523 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
525 TextPosition m = strlen((char *)pattern);
527 return TextCollection::document_result();
529 TextPosition sp = 0, ep = 0;
530 // Search all occurrences
531 Search(pattern, m, &sp, &ep);
533 // We want unique document indentifiers, using std::set to collect them
534 std::set<DocId> resultSet;
535 EnumerateDocuments(resultSet, sp, ep);
537 // Convert std::set to std::vector
538 TextCollection::document_result result(resultSet.begin(), resultSet.end());
542 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
544 TextPosition m = strlen((char *)pattern);
546 return TextCollection::document_result();
548 TextPosition sp = 0, ep = 0;
549 // Search all occurrences
550 Search(pattern, m, &sp, &ep);
552 // We want unique document indentifiers, using std::set to collect them
553 std::set<DocId> resultSet;
554 EnumerateDocuments(resultSet, sp, ep, begin, end);
556 // Convert std::set to std::vector
557 TextCollection::document_result result(resultSet.begin(), resultSet.end());
564 * * FIXME Lessthan or equal
566 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
568 TextPosition m = strlen((char *)pattern);
570 return TextCollection::document_result(); // empty result set
572 TextPosition sp = 0, ep = 0;
573 SearchLessThan(pattern, m, &sp, &ep);
575 // Report end-markers in result interval
576 return EnumerateEndmarkers(sp, ep);
579 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
581 TextPosition m = strlen((char *)pattern);
583 return TextCollection::document_result(); // empty result set
585 TextPosition sp = 0, ep = 0;
586 SearchLessThan(pattern, m, &sp, &ep);
588 // Iterate through end-markers in [sp,ep] and [begin, end]:
589 return EnumerateEndmarkers(sp, ep, begin, end);
593 TextCollection::document_result TCImplementation::KMismaches(uchar const * pattern, unsigned k) const
595 TextPosition m = strlen((char *)pattern);
597 return TextCollection::document_result(); // empty result set
599 suffix_range_vector ranges;
600 kmismatches(ranges, pattern, 0, n-1, m, k);
601 std::set<DocId> resultSet;
603 for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
604 // Iterate through docs in [sp,ep]:
605 EnumerateDocuments(resultSet, (*it).first, (*it).second);
607 // Convert std::set to std::vector
608 TextCollection::document_result result(resultSet.begin(), resultSet.end());
612 TextCollection::document_result TCImplementation::KErrors(uchar const * pattern, unsigned k) const
614 TextPosition m = strlen((char *)pattern);
616 return TextCollection::document_result(); // empty result set
618 suffix_range_vector ranges;
619 ulong *dd = new ulong[m+1];
620 for (ulong i=0;i<m+1;i++)
622 kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
625 std::set<DocId> resultSet;
626 for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
627 // Iterate through docs in [sp,ep]:
628 EnumerateDocuments(resultSet, (*it).first, (*it).second);
630 // Convert std::set to std::vector
631 TextCollection::document_result result(resultSet.begin(), resultSet.end());
637 * Full result set queries
639 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
641 TextPosition m = strlen((char *)pattern);
643 return full_result(); // FIXME Throw exception?
645 TextPosition sp = 0, ep = 0;
646 // Search all occurrences
647 Search(pattern, m, &sp, &ep);
650 result.reserve(ep-sp+1); // Try to avoid reallocation.
651 EnumeratePositions(result, sp, ep);
656 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
658 TextPosition m = strlen((char *)pattern);
660 return full_result(); // FIXME Throw exception?
662 TextPosition sp = 0, ep = 0;
663 // Search all occurrences
664 Search(pattern, m, &sp, &ep);
667 result.reserve(ep-sp+1); // Try to avoid reallocation.
668 EnumeratePositions(result, sp, ep, begin, end);
673 TextCollection::full_result TCImplementation::FullKMismatches(uchar const * pattern, unsigned k) const
675 TextPosition m = strlen((char *)pattern);
677 return TextCollection::full_result(); // empty result set
679 suffix_range_vector ranges;
680 ulong count = kmismatches(ranges, pattern, 0, n-1, m, k);
682 TextCollection::full_result result;
683 result.reserve(count); // avoid reallocation.
684 for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
685 // Iterate through docs in [sp,ep]:
686 EnumeratePositions(result, (*it).first, (*it).second);
690 TextCollection::full_result TCImplementation::FullKErrors(uchar const * pattern, unsigned k) const
692 TextPosition m = strlen((char *)pattern);
694 return TextCollection::full_result(); // empty result set
696 suffix_range_vector ranges;
697 ulong *dd = new ulong[m+1];
698 for (unsigned i=0;i<m+1;i++)
700 ulong count = kerrors(ranges, pattern, 0, n-1, m+k, k, dd, m);
703 TextCollection::full_result result;
704 result.reserve(count); // avoid reallocation.
705 for (suffix_range_vector::iterator it = ranges.begin(); it != ranges.end(); ++it)
706 // Iterate through docs in [sp,ep]:
707 EnumeratePositions(result, (*it).first, (*it).second);
713 * Save index to a file handle
715 * Throws a std::runtime_error exception on i/o error.
716 * First byte that is saved represents the version number of the save file.
717 * In version 2 files, the data fields are:
722 TextPosition bwtEndPos;
723 static_sequence * alphabetrank;
725 BlockArray *suffixes;
726 BlockArray *suffixDocId;
727 unsigned numberOfTexts;
729 static_sequence *docId;
731 void TCImplementation::Save(FILE *file) const
733 // Saving version info:
734 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
735 throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
737 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
738 throw std::runtime_error("TCImplementation::Save(): file write error (n).");
739 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
740 throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
742 for(ulong i = 0; i < 256; ++i)
743 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
744 throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
746 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
747 throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
749 alphabetrank->save(file);
751 suffixes->Save(file);
752 suffixDocId->Save(file);
754 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
755 throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
756 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
757 throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
760 textStorage->Save(file);
766 * Load index from a file handle
768 * Throws a std::runtime_error exception on i/o error.
769 * For more info, see TCImplementation::Save().
771 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
772 : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
773 suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
775 // Tools::StartTimer();
776 // std::cout << std::endl << "Loading..."<< std::endl;
779 if (std::fread(&verFlag, 1, 1, file) != 1)
780 throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
781 if (verFlag != TCImplementation::versionFlag)
782 throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
784 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
785 throw std::runtime_error("TCImplementation::Load(): file read error (n).");
786 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
787 throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
788 // FIXME samplerate can not be changed during load.
789 // if (this->samplerate == 0)
790 // this->samplerate = samplerate;
792 for(ulong i = 0; i < 256; ++i)
793 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
794 throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
796 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
797 throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
799 // std::cout << "Loading alphabet rank (" << Tools::GetTime() << " s)." << std::endl;
800 alphabetrank = static_sequence::load(file);
801 // std::cout << "Loading samples (" << Tools::GetTime() << " s)." << std::endl;
802 sampled = static_bitsequence::load(file);
803 suffixes = new BlockArray(file);
804 suffixDocId = new BlockArray(file);
806 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
807 throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
808 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
809 throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
811 // std::cout << "Loading Doc (" << Tools::GetTime() << " s)." << std::endl;
812 Doc = new ArrayDoc(file); //static_sequence::load(file);
813 // std::cout << "Loading text storage (" << Tools::GetTime() << " s)." << std::endl;
814 textStorage = TextStorage::Load(file);
816 // std::cout << "Loading done(" << Tools::GetTime() << " s)." << std::endl;
818 // FIXME Construct data structures with new samplerate
825 * Rest of the functions follow...
827 ulong TCImplementation::searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const
830 while (*sp<=*ep && i>=1)
832 c = (int)pattern[--i];
833 *sp = C[c]+alphabetrank->rank(c,*sp-1);
834 *ep = C[c]+alphabetrank->rank(c,*ep)-1;
837 return *ep - *sp + 1;
843 ulong TCImplementation::kmismatches(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k) const
848 result.push_back(std::make_pair(sp,ep));
858 sum = searchPrefix(pattern, j, &sp, &ep);
860 result.push_back(std::make_pair(sp, ep));
863 vector<int> chars = alphabetrank->accessAll(sp, ep);
864 for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
867 continue; // skip '\0'
869 spnew = C[c]+alphabetrank->rank(c,sp-1);
870 epnew = C[c]+alphabetrank->rank(c,ep)-1;
871 if (c!=pattern[j-1]) knew = (int)k-1; else knew = k;
872 if (knew>=0) sum += kmismatches(result, pattern, spnew, epnew, j-1, knew);
877 //first call kerrors(pattern,1,n,m+k,k,d,m), where d[i]=i
878 ulong TCImplementation::kerrors(suffix_range_vector &result, uchar const *pattern, ulong sp, ulong ep, ulong j, unsigned k, ulong const *d, ulong m) const
881 if (d[m]<=k) // range of suffixes with at most k-errors found
884 result.push_back(std::make_pair(sp, ep));
885 sum += (sp<=ep)?ep-sp+1:0ul;
889 ulong *dnew = new ulong[m+1];
894 vector<int> chars = alphabetrank->accessAll(sp, ep);
895 for (vector<int>::iterator it = chars.begin(); it != chars.end(); ++it)
898 continue; // skip '\0'
900 spnew = C[c]+alphabetrank->rank(c,sp-1);
901 epnew = C[c]+alphabetrank->rank(c,ep)-1;
902 if (spnew>epnew) continue;
905 for (p=1; p<=m; p++) {
906 dnew[p]=myminofthree(d[p]+1,dnew[p-1]+1,(c==pattern[m-p])?d[p-1]:(d[p-1]+1));
907 if (dnew[p]<lowerbound)
908 lowerbound = dnew[p];
911 sum += kerrors(result, pattern, spnew, epnew, j-1, k,dnew,m);
918 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
920 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
921 int c = (int)pattern[m-1];
923 TextPosition sp = C[c];
924 TextPosition ep = C[c+1]-1;
925 while (sp<=ep && i>=1)
927 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
928 c = (int)pattern[--i];
929 sp = C[c]+alphabetrank->rank(c,sp-1);
930 ep = C[c]+alphabetrank->rank(c,ep)-1;
940 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
942 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
943 int c = (int)pattern[m-1];
944 assert(c == 0); // Start from endmarkers
946 TextPosition sp = begin;
947 TextPosition ep = end;
948 while (sp<=ep && i>=1)
950 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
951 c = (int)pattern[--i];
952 sp = C[c]+alphabetrank->rank(c,sp-1);
953 ep = C[c]+alphabetrank->rank(c,ep)-1;
964 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
966 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
967 uint c = (int)pattern[m-1];
970 TextPosition ep = C[c+1]-1;
971 while (sp<=ep && i>=1)
973 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
974 c = (int)pattern[--i];
975 uint result = alphabetrank->rankLessThan(c,ep);
990 TCImplementation::~TCImplementation() {
999 void TCImplementation::makewavelet(uchar *bwt)
1008 if (C[i]>0) {min = i; break;}
1009 for (i=255;i>=min;--i)
1010 if (C[i]>0) {max = i; break;}
1012 ulong prev=C[0], temp;
1014 for (i=1;i<256;i++) {
1019 // this->codetable = node::makecodetable(bwt,n);
1020 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1022 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1023 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1025 #ifdef DEBUG_MEMUSAGE
1026 std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1027 HeapProfiler::ResetMaxHeapConsumption();
1030 alphabet_mapper * am = new alphabet_mapper_none();
1031 static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(8); //rrr02(8); // FIXME samplerate?
1032 wt_coder * wtc = new wt_coder_huff(bwt,n,am);//binary(bwt,n,am); // FIXME Huffman shape
1033 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1035 bwt = 0; // already deleted
1037 #ifdef DEBUG_MEMUSAGE
1038 std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1039 std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1043 void TCImplementation::maketables(ulong sampleLength, char tsType, CSA::DeltaVector & notIndexed, const string & niText)
1045 // Calculate BWT end-marker position (of last inserted text)
1048 uint alphabetrank_i_tmp = 0;
1049 uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
1052 i = C[c]+alphabetrank_i_tmp-1;
1053 c = alphabetrank->access(i, alphabetrank_i_tmp);
1056 this->bwtEndPos = i;
1059 #ifdef DEBUG_MEMUSAGE
1060 std::cerr << "heap usage before BWT traverse: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl;
1061 HeapProfiler::ResetMaxHeapConsumption();
1064 // Build up array for text starting positions
1065 // BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1066 // (*textStartPos)[0] = 0;
1068 // Mapping from end-markers to doc ID's:
1069 unsigned logNumberOfTexts = Tools::CeilLog2(numberOfTexts);
1070 // uint *endmarkerDocId = new uint[(numberOfTexts * logNumberOfTexts)/(8*sizeof(uint)) + 1];
1071 BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, logNumberOfTexts);
1073 BlockArray* positions = new BlockArray(sampleLength, Tools::CeilLog2(this->n));
1074 uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
1075 for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
1076 sampledpositions[i] = 0;
1078 ulong x,p=bwtEndPos;
1079 ulong sampleCount = 0;
1080 // Keeping track of text position of prev. end-marker seen
1081 ulong posOfSuccEndmarker = n-1;
1082 DocId textId = numberOfTexts;
1085 uint alphabetrank_i_tmp =0;
1087 // Text length = n + number of bytes not indexed.
1088 TextStorageBuilder tsbuilder(n + niText.length());
1089 ulong tsb_i = n + niText.length(); // Iterator from text length to 0.
1090 string::const_reverse_iterator nit_i = niText.rbegin(); // Iterator through non-indexed texts
1092 for (ulong i=n-1;i<ulongmax;i--) {
1093 // i substitutes SA->GetPos(i)
1096 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1098 tsbuilder[--tsb_i] = c; // Build TextStorage
1100 if ((posOfSuccEndmarker - i) % samplerate == 0 && c != '\0')
1102 set_field(sampledpositions,1,p,1);
1103 (*positions)[sampleCount] = p;
1109 unsigned prevTextId = textId; // Cache textId value.
1112 * At first c == '\0' it holds that (prevTextId == numberOfTexts), thus,
1113 * we have to search for the first text that is actually *indexed*
1114 * to get correct prevTextId.
1116 if (prevTextId == numberOfTexts)
1119 while (notIndexed.isSet(prevTextId))
1121 // Now prevTextId points to the first indexed Doc ID.
1125 * Insert non-indexed texts
1127 while (notIndexed.isSet(textId))
1130 tsbuilder[tsb_i] = *nit_i;
1133 } while (nit_i != niText.rend() && *nit_i != '\0');
1135 tsbuilder[tsb_i] = '\0';
1142 // Record the order of end-markers in BWT:
1143 ulong endmarkerRank = alphabetrank_i_tmp - 1;
1144 //set_field(endmarkerDocId, logNumberOfTexts, endmarkerRank, (textId + 1) % numberOfTexts);
1145 (*endmarkerDocId)[endmarkerRank] = prevTextId % numberOfTexts;
1147 // Store text length and text start position:
1148 if (textId < (DocId)numberOfTexts - 1)
1150 // (*textStartPos)[textId + 1] = x; // x-1 is text position of end-marker.
1152 posOfSuccEndmarker = i;
1155 // LF-mapping from '\0' does not work with this (pseudo) BWT.
1156 // Correct LF-mapping to the last char of the previous text:
1157 p = textId - notIndexed.rank(textId);
1159 else // Now c != '\0', do LF-mapping:
1160 p = C[c]+alphabetrank_i_tmp-1;
1162 while (textId > 0 && notIndexed.isSet(textId-1))
1166 tsbuilder[tsb_i] = *nit_i;
1168 } while (nit_i != niText.rend() && *nit_i != '\0');
1171 assert(textId == 0);
1173 assert(nit_i == niText.rend());
1175 #ifdef DEBUG_MEMUSAGE
1176 std::cerr << "heap usage before tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl;
1177 HeapProfiler::ResetMaxHeapConsumption();
1180 textStorage = tsbuilder.InitTextStorage(tsType);
1182 #ifdef DEBUG_MEMUSAGE
1183 std::cerr << "heap usage after tsbuilder init: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl;
1184 HeapProfiler::ResetMaxHeapConsumption();
1187 sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1188 delete [] sampledpositions;
1189 assert(sampleCount == sampleLength);
1190 assert(sampled->rank1(n-1) == sampleLength);
1192 #ifdef DEBUG_MEMUSAGE
1193 std::cerr << "heap usage after sampled bit vector: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl;
1194 HeapProfiler::ResetMaxHeapConsumption();
1197 // Suffixes store an offset from the text start position
1198 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1199 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1201 x = n + niText.length() - 2;
1202 textId = numberOfTexts - 1;
1203 posOfSuccEndmarker = x + 1;
1204 for(ulong i = 0; i < sampleLength; i ++) {
1205 // Find next sampled text position
1206 while ((posOfSuccEndmarker - x) % samplerate != 0
1207 || notIndexed.isSet(textId)) // Loop over non-indexed
1211 if (textStorage->IsEndmarker(x))
1213 posOfSuccEndmarker = x--;
1217 assert((*positions)[i] < n);
1218 ulong j = sampled->rank1((*positions)[i]);
1220 assert(j != 0); // if (j==0) j=sampleLength;
1222 TextPosition textPos = (x==n-1)?0:x+1;
1223 (*suffixDocId)[j-1] = textId; // textStorage->DocIdAtTextPos(textPos);
1224 assert(textStorage->DocIdAtTextPos(textPos) == textId);
1226 assert((*suffixDocId)[j-1] < numberOfTexts);
1227 // calculate offset from text start:
1228 (*suffixes)[j-1] = textPos - textStorage->TextStartPos((*suffixDocId)[j-1]);
1230 if (x != ~0lu && textStorage->IsEndmarker(x))
1232 posOfSuccEndmarker = x--;
1239 #ifdef DEBUG_MEMUSAGE
1240 std::cerr << "heap usage after sampled arrays: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " / " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes, " << HeapProfiler::GetHeapConsumption() << " / " << HeapProfiler::GetMaxHeapConsumption() << std::endl;
1241 HeapProfiler::ResetMaxHeapConsumption();
1244 #ifdef DEBUG_MEMUSAGE
1245 std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1246 HeapProfiler::ResetMaxHeapConsumption();
1249 /*alphabet_mapper * am = new alphabet_mapper_none();
1250 static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(32); // FIXME samplerate?
1251 Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, logNumberOfTexts, bmb, am, true);
1253 // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1255 Doc = new ArrayDoc(endmarkerDocId);
1257 #ifdef DEBUG_MEMUSAGE
1258 std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1264 * Finds document identifier for given text position
1266 * Starting text position of the document is stored into second parameter.
1267 * Binary searching on text starting positions.
1269 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1274 DocId b = numberOfTexts - 1;
1277 DocId c = a + (b - a)/2;
1278 if ((*textStartPos)[c] > i)
1280 else if ((*textStartPos)[c+1] > i)
1286 assert(a < (DocId)numberOfTexts);
1287 assert(i >= (*textStartPos)[a]);
1288 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));