1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
20 #include "TCImplementation.h"
22 //#define DEBUG_MEMUSAGE
24 #include "HeapProfiler.h" // FIXME remove
34 #include <cstring> // For strlen()
43 // Save file version info
44 const uchar TCImplementation::versionFlag = 6;
47 * Constructor inits an empty dynamic FM-index.
48 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
50 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
51 : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
52 suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
54 makewavelet(bwt); // Deletes bwt!
57 // Make sampling tables
61 bool TCImplementation::EmptyText(DocId k) const
63 assert(k < (DocId)numberOfTexts);
64 return false; // Empty texts are not indexed
67 uchar * TCImplementation::GetText(DocId k) const
69 assert(k < (DocId)numberOfTexts);
71 return textStorage->GetText(k);
72 /* TextPosition i = k;
75 // Reserve average string length to avoid reallocs
76 result.reserve(n/numberOfTexts);
78 uchar c = alphabetrank->access(i);
82 i = C[c]+alphabetrank->rank(c,i)-1;
84 c = alphabetrank->access(i); // "next" char.
87 // Convert to uchar (FIXME return string?)
89 uchar* res = new uchar[i+1];
91 for (ulong j = 0; j < i; ++j)
92 res[i-j-1] = result[j];
97 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
99 assert(k < (DocId)numberOfTexts);
100 assert(j < (*textLength)[k]);
105 // Start position of k'th text
106 ulong start = (*textStartPos)[k];
108 return Substring(i + start, j-i+1);
111 /******************************************************************
112 * Existential queries
114 bool TCImplementation::IsPrefix(uchar const * pattern) const
116 TextPosition m = strlen((char *)pattern);
120 TextPosition sp = 0, ep = 0;
121 Search(pattern, m, &sp, &ep);
123 // Check for end-marker(s) in result interval
124 if (CountEndmarkers(sp, ep))
129 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
131 TextPosition m = strlen((char *)pattern);
135 TextPosition sp = 0, ep = 0;
136 Search(pattern, m, &sp, &ep);
138 // Check for end-marker(s) in result interval
139 if (CountEndmarkers(sp, ep, begin, end))
145 bool TCImplementation::IsSuffix(uchar const *pattern) const
147 // Here counting is as fast as IsSuffix():
148 if (CountSuffix(pattern) > 0)
153 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
155 // Here counting is as fast as IsSuffix():
156 if (CountSuffix(pattern, begin, end) > 0)
161 bool TCImplementation::IsEqual(uchar const *pattern) const
163 TextPosition m = std::strlen((char *)pattern);
165 return false; // No empty texts exists
167 TextPosition sp = 0, ep = 0;
168 // Match including end-marker
169 Search(pattern, m+1, &sp, &ep);
171 // Check for end-marker(s) in result interval
172 if (CountEndmarkers(sp, ep))
177 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
179 TextPosition m = std::strlen((char *)pattern);
181 return false; // No empty texts exists
183 TextPosition sp = 0, ep = 0;
184 // Match including end-marker
185 Search(pattern, m+1, &sp, &ep, begin, end);
187 // Check for end-marker(s) in result interval
188 if (CountEndmarkers(sp, ep))
193 bool TCImplementation::IsContains(uchar const * pattern) const
195 TextPosition m = strlen((char *)pattern);
199 TextPosition sp = 0, ep = 0;
200 // Just check if pattern exists somewhere
201 ulong count = Search(pattern, m, &sp, &ep);
208 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
210 // Here counting is as fast as existential querying
211 if (CountContains(pattern, begin, end) > 0)
212 return true; // FIXME No need to filter result set
216 bool TCImplementation::IsLessThan(uchar const * pattern) const
218 if (CountLessThan(pattern) > 0)
223 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
225 if (CountLessThan(pattern, begin, end) > 0)
230 /******************************************************************
233 ulong TCImplementation::Count(uchar const * pattern) const
235 TextPosition m = strlen((char *)pattern);
239 TextPosition sp = 0, ep = 0;
240 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
244 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
246 TextPosition m = strlen((char *)pattern);
248 return numberOfTexts;
250 TextPosition sp = 0, ep = 0;
251 Search(pattern, m, &sp, &ep);
253 // Count end-markers in result interval
254 return CountEndmarkers(sp, ep);
257 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
259 TextPosition m = strlen((char *)pattern);
261 return numberOfTexts;
263 TextPosition sp = 0, ep = 0;
264 Search(pattern, m, &sp, &ep);
266 // Count end-markers in result interval
267 return CountEndmarkers(sp, ep, begin, end);
270 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
272 TextPosition m = strlen((char *)pattern);
274 return numberOfTexts;
276 TextPosition sp = 0, ep = 0;
277 // Search with end-marker
278 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
283 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
285 TextPosition m = strlen((char *)pattern);
287 return numberOfTexts;
289 TextPosition sp = 0, ep = 0;
290 // Search with end-marker
291 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
296 unsigned TCImplementation::CountEqual(uchar const *pattern) const
298 TextPosition m = strlen((char const *)pattern);
300 return 0; // No empty texts.
302 TextPosition sp = 0, ep = 0;
303 // Match including end-marker
304 Search(pattern, m+1, &sp, &ep);
306 // Count end-markers in result interval
307 return CountEndmarkers(sp, ep);
310 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
312 TextPosition m = strlen((char const *)pattern);
314 return 0; // No empty texts.
316 TextPosition sp = 0, ep = 0;
317 // Match including end-marker
318 Search(pattern, m+1, &sp, &ep, begin, end);
320 // Count end-markers in result interval
321 return CountEndmarkers(sp, ep); // Already within [begin, end]
324 unsigned TCImplementation::CountContains(uchar const * pattern) const
326 TextPosition m = strlen((char const *)pattern);
328 return numberOfTexts; // Total number of texts.
330 // Here counting is as slow as fetching the result set
331 // because we have to filter out occ's that fall within same document.
332 TextCollection::document_result result = Contains(pattern);
333 return result.size();
336 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
338 TextPosition m = strlen((char const *)pattern);
340 return numberOfTexts; // Total number of texts.
342 // Here counting is as slow as fetching the result set
343 // because we have to filter out occ's that fall within same document.
344 TextCollection::document_result result = Contains(pattern, begin, end);
345 return result.size();
348 // Less than or equal
349 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
351 TextPosition m = strlen((char const *)pattern);
353 return 0; // No empty texts.
355 TextPosition sp = 0, ep = 0;
356 SearchLessThan(pattern, m, &sp, &ep);
358 // Count end-markers in result interval
359 return CountEndmarkers(sp, ep);
362 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
364 TextPosition m = strlen((char const *)pattern);
366 return 0; // No empty texts.
368 TextPosition sp = 0, ep = 0;
369 SearchLessThan(pattern, m, &sp, &ep);
371 // Count end-markers in result interval
372 return CountEndmarkers(sp, ep, begin, end);
376 * Document reporting queries
378 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
380 TextPosition m = strlen((char *)pattern);
382 return TextCollection::document_result(); // FIXME Should return all 1...k
384 TextPosition sp = 0, ep = 0;
385 Search(pattern, m, &sp, &ep);
387 // Iterate through end-markers in [sp,ep]:
388 return EnumerateEndmarkers(sp, ep);
391 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
393 TextPosition m = strlen((char *)pattern);
395 return TextCollection::document_result(); // FIXME Should return all 1...k
397 TextPosition sp = 0, ep = 0;
398 Search(pattern, m, &sp, &ep);
400 // Return end-markers in [sp,ep] and [begin, end]:
401 return EnumerateEndmarkers(sp, ep, begin, end);
404 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
406 TextPosition m = strlen((char *)pattern);
408 return TextCollection::document_result(); // FIXME Should return all 1...k
410 TextPosition sp = 0, ep = 0;
411 // Search with end-marker
412 Search(pattern, m+1, &sp, &ep);
414 TextCollection::document_result result;
415 result.reserve(ep-sp+1); // Try to avoid reallocation.
417 // Check each occurrence
418 for (; sp <= ep; ++sp)
422 uchar c = alphabetrank->access(i);
423 while (c != '\0' && !sampled->access(i))
425 i = C[c]+alphabetrank->rank(c,i)-1;
426 c = alphabetrank->access(i);
428 // Assert: c == '\0' OR sampled->IsBitSet(i)
432 // Rank among the end-markers in BWT
433 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
434 result.push_back(Doc->access(endmarkerRank));
436 else // Sampled position
438 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
439 result.push_back(docId);
446 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
448 TextPosition m = strlen((char *)pattern);
450 return TextCollection::document_result(); // FIXME Should return all 1...k
452 TextPosition sp = 0, ep = 0;
453 // Search with end-marker
454 Search(pattern, m+1, &sp, &ep, begin, end);
456 TextCollection::document_result result;
457 result.reserve(ep-sp+1); // Try to avoid reallocation.
459 // Check each occurrence, already within [begin, end]
460 for (; sp <= ep; ++sp)
464 uchar c = alphabetrank->access(i);
465 while (c != '\0' && !sampled->access(i))
467 i = C[c]+alphabetrank->rank(c,i)-1;
468 c = alphabetrank->access(i);
470 // Assert: c == '\0' OR sampled->IsBitSet(i)
474 // Rank among the end-markers in BWT
475 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
476 result.push_back(Doc->access(endmarkerRank));
478 else // Sampled position
480 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
481 result.push_back(docId);
489 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
491 TextPosition m = strlen((char const *)pattern);
493 return TextCollection::document_result(); // FIXME Should return all empty texts
495 TextPosition sp = 0, ep = 0;
496 // Match including end-marker
497 Search(pattern, m+1, &sp, &ep);
499 // Report end-markers in result interval
500 return EnumerateEndmarkers(sp, ep);
503 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
505 TextPosition m = strlen((char const *)pattern);
507 return TextCollection::document_result(); // FIXME Should return all empty texts
509 TextPosition sp = 0, ep = 0;
510 // Match including end-marker
511 Search(pattern, m+1, &sp, &ep, begin, end);
513 // Report end-markers in result interval
514 return EnumerateEndmarkers(sp, ep, begin, end);
518 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
520 TextPosition m = strlen((char *)pattern);
522 return TextCollection::document_result();
524 TextPosition sp = 0, ep = 0;
525 // Search all occurrences
526 Search(pattern, m, &sp, &ep);
528 // We want unique document indentifiers, using std::set to collect them
529 std::set<DocId> resultSet;
531 // Check each occurrence
532 for (; sp <= ep; ++sp)
535 uchar c = alphabetrank->access(i);
536 while (c != '\0' && !sampled->access(i))
538 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
539 c = alphabetrank->access(i);
543 // Rank among the end-markers in BWT
544 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
545 resultSet.insert(Doc->access(endmarkerRank));
549 DocId di = (*suffixDocId)[sampled->rank1(i)-1];
550 assert((unsigned)di < numberOfTexts);
551 resultSet.insert(di);
555 // Convert std::set to std::vector
556 TextCollection::document_result result(resultSet.begin(), resultSet.end());
560 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
562 TextPosition m = strlen((char *)pattern);
564 return TextCollection::document_result();
566 TextPosition sp = 0, ep = 0;
567 // Search all occurrences
568 Search(pattern, m, &sp, &ep);
570 // We want unique document indentifiers, using std::set to collect them
571 std::set<DocId> resultSet;
573 // Check each occurrence
574 for (; sp <= ep; ++sp)
577 uchar c = alphabetrank->access(i);
578 while (c != '\0' && !sampled->access(i))
580 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
581 c = alphabetrank->access(i);
585 // Rank among the end-markers in BWT
586 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
587 DocId docId = Doc->access(endmarkerRank);
588 if (docId >= begin && docId <= end)
589 resultSet.insert(docId);
593 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
594 assert((unsigned)docId < numberOfTexts);
595 if (docId >= begin && docId <= end)
596 resultSet.insert(docId);
600 // Convert std::set to std::vector
601 TextCollection::document_result result(resultSet.begin(), resultSet.end());
605 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
607 TextPosition m = strlen((char *)pattern);
609 return TextCollection::document_result(); // empty result set
611 TextPosition sp = 0, ep = 0;
612 SearchLessThan(pattern, m, &sp, &ep);
614 // Report end-markers in result interval
615 return EnumerateEndmarkers(sp, ep);
618 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
620 TextPosition m = strlen((char *)pattern);
622 return TextCollection::document_result(); // empty result set
624 TextPosition sp = 0, ep = 0;
625 SearchLessThan(pattern, m, &sp, &ep);
627 // Iterate through end-markers in [sp,ep] and [begin, end]:
628 return EnumerateEndmarkers(sp, ep, begin, end);
632 * Full result set queries
634 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
636 TextPosition m = strlen((char *)pattern);
638 return full_result(); // FIXME Throw exception?
640 TextPosition sp = 0, ep = 0;
641 // Search all occurrences
642 Search(pattern, m, &sp, &ep);
645 result.reserve(ep-sp+1); // Try to avoid reallocation.
647 // Report each occurrence
648 for (; sp <= ep; ++sp)
651 TextPosition dist = 0;
652 uchar c = alphabetrank->access(i);
653 while (c != '\0' && !sampled->access(i))
655 i = C[c]+alphabetrank->rank(c,i)-1;
656 c = alphabetrank->access(i);
661 // Rank among the end-markers in BWT
662 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
663 DocId docId = Doc->access(endmarkerRank);
664 result.push_back(make_pair(docId, dist));
668 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
669 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
671 result.push_back(make_pair(docId, textPos));
678 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
680 TextPosition m = strlen((char *)pattern);
682 return full_result(); // FIXME Throw exception?
684 TextPosition sp = 0, ep = 0;
685 // Search all occurrences
686 Search(pattern, m, &sp, &ep);
689 result.reserve(ep-sp+1); // Try to avoid reallocation.
691 // Report each occurrence
692 for (; sp <= ep; ++sp)
695 TextPosition dist = 0;
696 uchar c = alphabetrank->access(i);
697 while (c != '\0' && !sampled->access(i))
699 i = C[c]+alphabetrank->rank(c,i)-1;
700 c = alphabetrank->access(i);
705 // Rank among the end-markers in BWT
706 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
708 // End-marker that we found belongs to the "preceeding" doc in collection:
709 DocId docId = Doc->access(endmarkerRank);
710 if (docId >= begin && docId <= end)
711 result.push_back(make_pair(docId, dist));
715 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
716 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
718 if (docId >= begin && docId <= end)
719 result.push_back(make_pair(docId, textPos));
728 * Save index to a file handle
730 * Throws a std::runtime_error exception on i/o error.
731 * First byte that is saved represents the version number of the save file.
732 * In version 2 files, the data fields are:
737 TextPosition bwtEndPos;
738 static_sequence * alphabetrank;
740 BlockArray *suffixes;
741 BlockArray *suffixDocId;
742 unsigned numberOfTexts;
744 static_sequence *docId;
746 void TCImplementation::Save(FILE *file) const
748 // Saving version info:
749 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
750 throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
752 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
753 throw std::runtime_error("TCImplementation::Save(): file write error (n).");
754 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
755 throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
757 for(ulong i = 0; i < 256; ++i)
758 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
759 throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
761 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
762 throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
764 alphabetrank->save(file);
766 suffixes->Save(file);
767 suffixDocId->Save(file);
769 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
770 throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
771 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
772 throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
775 textStorage->Save(file);
781 * Load index from a file handle
783 * Throws a std::runtime_error exception on i/o error.
784 * For more info, see TCImplementation::Save().
786 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
787 : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
788 suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
791 if (std::fread(&verFlag, 1, 1, file) != 1)
792 throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
793 if (verFlag != TCImplementation::versionFlag)
794 throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
796 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
797 throw std::runtime_error("TCImplementation::Load(): file read error (n).");
798 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
799 throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
800 // FIXME samplerate can not be changed during load.
801 // if (this->samplerate == 0)
802 // this->samplerate = samplerate;
804 for(ulong i = 0; i < 256; ++i)
805 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
806 throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
808 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
809 throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
811 alphabetrank = static_sequence::load(file);
812 sampled = static_bitsequence::load(file);
813 suffixes = new BlockArray(file);
814 suffixDocId = new BlockArray(file);
816 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
817 throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
818 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
819 throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
821 Doc = static_sequence::load(file);
822 textStorage = new TextStorage(file);
824 // FIXME Construct data structures with new samplerate
831 * Rest of the functions follow...
836 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
838 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
839 int c = (int)pattern[m-1];
841 TextPosition sp = C[c];
842 TextPosition ep = C[c+1]-1;
843 while (sp<=ep && i>=1)
845 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
846 c = (int)pattern[--i];
847 sp = C[c]+alphabetrank->rank(c,sp-1);
848 ep = C[c]+alphabetrank->rank(c,ep)-1;
858 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
860 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
861 int c = (int)pattern[m-1];
862 assert(c == 0); // Start from endmarkers
864 TextPosition sp = begin;
865 TextPosition ep = end;
866 while (sp<=ep && i>=1)
868 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
869 c = (int)pattern[--i];
870 sp = C[c]+alphabetrank->rank(c,sp-1);
871 ep = C[c]+alphabetrank->rank(c,ep)-1;
882 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
884 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
885 uint c = (int)pattern[m-1];
888 TextPosition ep = C[c+1]-1;
889 while (sp<=ep && i>=1)
891 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
892 c = (int)pattern[--i];
893 uint result = alphabetrank->rankLessThan(c,ep);
908 TCImplementation::~TCImplementation() {
917 void TCImplementation::makewavelet(uchar *bwt)
926 if (C[i]>0) {min = i; break;}
927 for (i=255;i>=min;--i)
928 if (C[i]>0) {max = i; break;}
930 ulong prev=C[0], temp;
932 for (i=1;i<256;i++) {
937 // this->codetable = node::makecodetable(bwt,n);
938 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
940 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
941 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
943 #ifdef DEBUG_MEMUSAGE
944 std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
945 HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
948 alphabet_mapper * am = new alphabet_mapper_none();
949 static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
950 wt_coder * wtc = new wt_coder_binary(bwt,n,am);
951 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
953 bwt = 0; // already deleted
955 #ifdef DEBUG_MEMUSAGE
956 std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
957 std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
961 void TCImplementation::maketables()
963 // Calculate BWT end-marker position (of last inserted text)
966 uint alphabetrank_i_tmp = 0;
967 uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
970 i = C[c]+alphabetrank_i_tmp-1;
971 c = alphabetrank->access(i, alphabetrank_i_tmp);
977 // Build up array for text starting positions
978 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
979 (*textStartPos)[0] = 0;
981 // Mapping from end-markers to doc ID's:
982 uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs.
984 uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
985 for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
986 sampledpositions[i] = 0;
989 ulong sampleCount = 0;
990 // Keeping track of text position of prev. end-marker seen
991 ulong posOfSuccEndmarker = n;
992 DocId textId = numberOfTexts;
995 uint alphabetrank_i_tmp =0;
998 * First pass: populate tables textStartPos and sampledpositions.
1000 for (ulong i=n-1;i<ulongmax;i--) {
1001 // i substitutes SA->GetPos(i)
1004 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1005 set_field(sampledpositions,1,p,1);
1009 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1014 // Record the order of end-markers in BWT:
1015 ulong endmarkerRank = alphabetrank_i_tmp - 1;
1016 endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts;
1018 // Store text length and text start position:
1019 if (textId < (DocId)numberOfTexts - 1)
1021 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1022 posOfSuccEndmarker = x;
1025 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1026 p = textId; // Correct LF-mapping to the last char of the previous text.
1028 else // Now c != '\0', do LF-mapping:
1029 p = C[c]+alphabetrank_i_tmp-1;
1031 assert(textId == 0);
1033 sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1034 delete [] sampledpositions;
1035 ulong sampleLength = sampled->rank1(n-1);
1036 assert(sampleCount == sampleLength);
1038 // Suffixes store an offset from the text start position
1039 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1040 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1043 textId = numberOfTexts;
1045 TextStorageBuilder tsbuilder(n);
1048 * Second pass: populate tables suffixes and suffixDocId.
1050 for (ulong i=n-1;i<ulongmax;i--) {
1053 if (sampled->access(p)) {
1054 ulong j = sampled->rank1(p)-1;
1056 (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x);
1058 // calculate offset from text start:
1059 (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]];
1062 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1068 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1069 p = textId; // Correct LF-mapping to the last char of the previous text.
1071 else // Now c != '\0', do LF-mapping:
1072 p = C[c]+alphabetrank_i_tmp-1;
1074 assert(textId == 0);
1075 delete textStartPos;
1077 textStorage = tsbuilder.InitTextStorage();
1079 #ifdef DEBUG_MEMUSAGE
1080 std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1081 HeapProfiler::ResetMaxHeapConsumption();
1084 alphabet_mapper * am = new alphabet_mapper_none();
1085 static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1086 Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true);
1088 // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1090 #ifdef DEBUG_MEMUSAGE
1091 std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1097 * Finds document identifier for given text position
1099 * Starting text position of the document is stored into second parameter.
1100 * Binary searching on text starting positions.
1102 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1107 DocId b = numberOfTexts - 1;
1110 DocId c = a + (b - a)/2;
1111 if ((*textStartPos)[c] > i)
1113 else if ((*textStartPos)[c+1] > i)
1119 assert(a < (DocId)numberOfTexts);
1120 assert(i >= (*textStartPos)[a]);
1121 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));