1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
20 #include "TCImplementation.h"
23 #include "HeapProfiler.h" // FIXME remove
33 #include <cstring> // For strlen()
42 // Save file version info
43 const uchar TCImplementation::versionFlag = 4;
46 * Constructor inits an empty dynamic FM-index.
47 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
49 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
50 : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
51 suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
53 makewavelet(bwt); // Deletes bwt!
56 // Make sampling tables
60 bool TCImplementation::EmptyText(DocId k) const
62 assert(k < (DocId)numberOfTexts);
63 return false; // Empty texts are not indexed
66 uchar* TCImplementation::GetText(DocId k) const
68 assert(k < (DocId)numberOfTexts);
72 // Reserve average string length to avoid reallocs
73 result.reserve(n/numberOfTexts);
75 uchar c = alphabetrank->access(i);
79 i = C[c]+alphabetrank->rank(c,i)-1;
81 c = alphabetrank->access(i); // "next" char.
84 // Convert to uchar (FIXME return string?)
86 uchar* res = new uchar[i+1];
88 for (ulong j = 0; j < i; ++j)
89 res[i-j-1] = result[j];
94 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
96 assert(k < (DocId)numberOfTexts);
97 assert(j < (*textLength)[k]);
102 // Start position of k'th text
103 ulong start = (*textStartPos)[k];
105 return Substring(i + start, j-i+1);
108 /******************************************************************
109 * Existential queries
111 bool TCImplementation::IsPrefix(uchar const * pattern) const
113 TextPosition m = strlen((char *)pattern);
117 TextPosition sp = 0, ep = 0;
118 Search(pattern, m, &sp, &ep);
120 // Check for end-marker(s) in result interval
121 if (CountEndmarkers(sp, ep))
126 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
128 TextPosition m = strlen((char *)pattern);
132 TextPosition sp = 0, ep = 0;
133 Search(pattern, m, &sp, &ep);
135 // Check for end-marker(s) in result interval
136 if (CountEndmarkers(sp, ep, begin, end))
142 bool TCImplementation::IsSuffix(uchar const *pattern) const
144 // Here counting is as fast as IsSuffix():
145 if (CountSuffix(pattern) > 0)
150 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
152 // Here counting is as fast as IsSuffix():
153 if (CountSuffix(pattern, begin, end) > 0)
158 bool TCImplementation::IsEqual(uchar const *pattern) const
160 TextPosition m = std::strlen((char *)pattern);
162 return false; // No empty texts exists
164 TextPosition sp = 0, ep = 0;
165 // Match including end-marker
166 Search(pattern, m+1, &sp, &ep);
168 // Check for end-marker(s) in result interval
169 if (CountEndmarkers(sp, ep))
174 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
176 TextPosition m = std::strlen((char *)pattern);
178 return false; // No empty texts exists
180 TextPosition sp = 0, ep = 0;
181 // Match including end-marker
182 Search(pattern, m+1, &sp, &ep, begin, end);
184 // Check for end-marker(s) in result interval
185 if (CountEndmarkers(sp, ep))
190 bool TCImplementation::IsContains(uchar const * pattern) const
192 TextPosition m = strlen((char *)pattern);
196 TextPosition sp = 0, ep = 0;
197 // Just check if pattern exists somewhere
198 ulong count = Search(pattern, m, &sp, &ep);
205 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
207 // Here counting is as fast as existential querying
208 if (CountContains(pattern, begin, end) > 0)
209 return true; // FIXME No need to filter result set
213 bool TCImplementation::IsLessThan(uchar const * pattern) const
215 if (CountLessThan(pattern) > 0)
220 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
222 if (CountLessThan(pattern, begin, end) > 0)
227 /******************************************************************
230 ulong TCImplementation::Count(uchar const * pattern) const
232 TextPosition m = strlen((char *)pattern);
236 TextPosition sp = 0, ep = 0;
237 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
241 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
243 TextPosition m = strlen((char *)pattern);
245 return numberOfTexts;
247 TextPosition sp = 0, ep = 0;
248 Search(pattern, m, &sp, &ep);
250 // Count end-markers in result interval
251 return CountEndmarkers(sp, ep);
254 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
256 TextPosition m = strlen((char *)pattern);
258 return numberOfTexts;
260 TextPosition sp = 0, ep = 0;
261 Search(pattern, m, &sp, &ep);
263 // Count end-markers in result interval
264 return CountEndmarkers(sp, ep, begin, end);
267 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
269 TextPosition m = strlen((char *)pattern);
271 return numberOfTexts;
273 TextPosition sp = 0, ep = 0;
274 // Search with end-marker
275 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
280 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
282 TextPosition m = strlen((char *)pattern);
284 return numberOfTexts;
286 TextPosition sp = 0, ep = 0;
287 // Search with end-marker
288 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
293 unsigned TCImplementation::CountEqual(uchar const *pattern) const
295 TextPosition m = strlen((char const *)pattern);
297 return 0; // No empty texts.
299 TextPosition sp = 0, ep = 0;
300 // Match including end-marker
301 Search(pattern, m+1, &sp, &ep);
303 // Count end-markers in result interval
304 return CountEndmarkers(sp, ep);
307 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
309 TextPosition m = strlen((char const *)pattern);
311 return 0; // No empty texts.
313 TextPosition sp = 0, ep = 0;
314 // Match including end-marker
315 Search(pattern, m+1, &sp, &ep, begin, end);
317 // Count end-markers in result interval
318 return CountEndmarkers(sp, ep); // Already within [begin, end]
321 unsigned TCImplementation::CountContains(uchar const * pattern) const
323 TextPosition m = strlen((char const *)pattern);
325 return numberOfTexts; // Total number of texts.
327 // Here counting is as slow as fetching the result set
328 // because we have to filter out occ's that fall within same document.
329 TextCollection::document_result result = Contains(pattern);
330 return result.size();
333 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
335 TextPosition m = strlen((char const *)pattern);
337 return numberOfTexts; // Total number of texts.
339 // Here counting is as slow as fetching the result set
340 // because we have to filter out occ's that fall within same document.
341 TextCollection::document_result result = Contains(pattern, begin, end);
342 return result.size();
345 // Less than or equal
346 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
348 TextPosition m = strlen((char const *)pattern);
350 return 0; // No empty texts.
352 TextPosition sp = 0, ep = 0;
353 SearchLessThan(pattern, m, &sp, &ep);
355 // Count end-markers in result interval
356 return CountEndmarkers(sp, ep);
359 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
361 TextPosition m = strlen((char const *)pattern);
363 return 0; // No empty texts.
365 TextPosition sp = 0, ep = 0;
366 SearchLessThan(pattern, m, &sp, &ep);
368 // Count end-markers in result interval
369 return CountEndmarkers(sp, ep, begin, end);
373 * Document reporting queries
375 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
377 TextPosition m = strlen((char *)pattern);
379 return TextCollection::document_result(); // FIXME Should return all 1...k
381 TextPosition sp = 0, ep = 0;
382 Search(pattern, m, &sp, &ep);
384 // Iterate through end-markers in [sp,ep]:
385 return EnumerateEndmarkers(sp, ep);
388 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
390 TextPosition m = strlen((char *)pattern);
392 return TextCollection::document_result(); // FIXME Should return all 1...k
394 TextPosition sp = 0, ep = 0;
395 Search(pattern, m, &sp, &ep);
397 // Return end-markers in [sp,ep] and [begin, end]:
398 return EnumerateEndmarkers(sp, ep, begin, end);
401 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
403 TextPosition m = strlen((char *)pattern);
405 return TextCollection::document_result(); // FIXME Should return all 1...k
407 TextPosition sp = 0, ep = 0;
408 // Search with end-marker
409 Search(pattern, m+1, &sp, &ep);
411 TextCollection::document_result result;
412 result.reserve(ep-sp+1); // Try to avoid reallocation.
414 // Check each occurrence
415 for (; sp <= ep; ++sp)
419 uchar c = alphabetrank->access(i);
420 while (c != '\0' && !sampled->access(i))
422 i = C[c]+alphabetrank->rank(c,i)-1;
423 c = alphabetrank->access(i);
425 // Assert: c == '\0' OR sampled->IsBitSet(i)
429 // Rank among the end-markers in BWT
430 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
431 result.push_back(Doc->access(endmarkerRank));
433 else // Sampled position
435 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
436 result.push_back(docId);
443 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
445 TextPosition m = strlen((char *)pattern);
447 return TextCollection::document_result(); // FIXME Should return all 1...k
449 TextPosition sp = 0, ep = 0;
450 // Search with end-marker
451 Search(pattern, m+1, &sp, &ep, begin, end);
453 TextCollection::document_result result;
454 result.reserve(ep-sp+1); // Try to avoid reallocation.
456 // Check each occurrence, already within [begin, end]
457 for (; sp <= ep; ++sp)
461 uchar c = alphabetrank->access(i);
462 while (c != '\0' && !sampled->access(i))
464 i = C[c]+alphabetrank->rank(c,i)-1;
465 c = alphabetrank->access(i);
467 // Assert: c == '\0' OR sampled->IsBitSet(i)
471 // Rank among the end-markers in BWT
472 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
473 result.push_back(Doc->access(endmarkerRank));
475 else // Sampled position
477 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
478 result.push_back(docId);
486 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
488 TextPosition m = strlen((char const *)pattern);
490 return TextCollection::document_result(); // FIXME Should return all empty texts
492 TextPosition sp = 0, ep = 0;
493 // Match including end-marker
494 Search(pattern, m+1, &sp, &ep);
496 // Report end-markers in result interval
497 return EnumerateEndmarkers(sp, ep);
500 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
502 TextPosition m = strlen((char const *)pattern);
504 return TextCollection::document_result(); // FIXME Should return all empty texts
506 TextPosition sp = 0, ep = 0;
507 // Match including end-marker
508 Search(pattern, m+1, &sp, &ep, begin, end);
510 // Report end-markers in result interval
511 return EnumerateEndmarkers(sp, ep, begin, end);
515 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
517 TextPosition m = strlen((char *)pattern);
519 return TextCollection::document_result();
521 TextPosition sp = 0, ep = 0;
522 // Search all occurrences
523 Search(pattern, m, &sp, &ep);
525 // We want unique document indentifiers, using std::set to collect them
526 std::set<DocId> resultSet;
528 // Check each occurrence
529 for (; sp <= ep; ++sp)
532 uchar c = alphabetrank->access(i);
533 while (c != '\0' && !sampled->access(i))
535 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
536 c = alphabetrank->access(i);
540 // Rank among the end-markers in BWT
541 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
542 resultSet.insert(Doc->access(endmarkerRank));
546 DocId di = (*suffixDocId)[sampled->rank1(i)-1];
547 assert((unsigned)di < numberOfTexts);
548 resultSet.insert(di);
552 // Convert std::set to std::vector
553 TextCollection::document_result result(resultSet.begin(), resultSet.end());
557 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
559 TextPosition m = strlen((char *)pattern);
561 return TextCollection::document_result();
563 TextPosition sp = 0, ep = 0;
564 // Search all occurrences
565 Search(pattern, m, &sp, &ep);
567 // We want unique document indentifiers, using std::set to collect them
568 std::set<DocId> resultSet;
570 // Check each occurrence
571 for (; sp <= ep; ++sp)
574 uchar c = alphabetrank->access(i);
575 while (c != '\0' && !sampled->access(i))
577 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
578 c = alphabetrank->access(i);
582 // Rank among the end-markers in BWT
583 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
584 DocId docId = Doc->access(endmarkerRank);
585 if (docId >= begin && docId <= end)
586 resultSet.insert(docId);
590 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
591 assert((unsigned)docId < numberOfTexts);
592 if (docId >= begin && docId <= end)
593 resultSet.insert(docId);
597 // Convert std::set to std::vector
598 TextCollection::document_result result(resultSet.begin(), resultSet.end());
602 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
604 TextPosition m = strlen((char *)pattern);
606 return TextCollection::document_result(); // empty result set
608 TextPosition sp = 0, ep = 0;
609 SearchLessThan(pattern, m, &sp, &ep);
611 // Report end-markers in result interval
612 return EnumerateEndmarkers(sp, ep);
615 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
617 TextPosition m = strlen((char *)pattern);
619 return TextCollection::document_result(); // empty result set
621 TextPosition sp = 0, ep = 0;
622 SearchLessThan(pattern, m, &sp, &ep);
624 // Iterate through end-markers in [sp,ep] and [begin, end]:
625 return EnumerateEndmarkers(sp, ep, begin, end);
629 * Full result set queries
631 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
633 TextPosition m = strlen((char *)pattern);
635 return full_result(); // FIXME Throw exception?
637 TextPosition sp = 0, ep = 0;
638 // Search all occurrences
639 Search(pattern, m, &sp, &ep);
642 result.reserve(ep-sp+1); // Try to avoid reallocation.
644 // Report each occurrence
645 for (; sp <= ep; ++sp)
648 TextPosition dist = 0;
649 uchar c = alphabetrank->access(i);
650 while (c != '\0' && !sampled->access(i))
652 i = C[c]+alphabetrank->rank(c,i)-1;
653 c = alphabetrank->access(i);
658 // Rank among the end-markers in BWT
659 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
660 DocId docId = Doc->access(endmarkerRank);
661 result.push_back(make_pair(docId, dist));
665 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
666 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
668 result.push_back(make_pair(docId, textPos));
675 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
677 TextPosition m = strlen((char *)pattern);
679 return full_result(); // FIXME Throw exception?
681 TextPosition sp = 0, ep = 0;
682 // Search all occurrences
683 Search(pattern, m, &sp, &ep);
686 result.reserve(ep-sp+1); // Try to avoid reallocation.
688 // Report each occurrence
689 for (; sp <= ep; ++sp)
692 TextPosition dist = 0;
693 uchar c = alphabetrank->access(i);
694 while (c != '\0' && !sampled->access(i))
696 i = C[c]+alphabetrank->rank(c,i)-1;
697 c = alphabetrank->access(i);
702 // Rank among the end-markers in BWT
703 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
705 // End-marker that we found belongs to the "preceeding" doc in collection:
706 DocId docId = Doc->access(endmarkerRank);
707 if (docId >= begin && docId <= end)
708 result.push_back(make_pair(docId, dist));
712 TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
713 DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
715 if (docId >= begin && docId <= end)
716 result.push_back(make_pair(docId, textPos));
725 * Save index to a file handle
727 * Throws a std::runtime_error exception on i/o error.
728 * First byte that is saved represents the version number of the save file.
729 * In version 2 files, the data fields are:
734 TextPosition bwtEndPos;
735 static_sequence * alphabetrank;
737 BlockArray *suffixes;
738 BlockArray *suffixDocId;
739 unsigned numberOfTexts;
741 static_sequence *docId;
743 void TCImplementation::Save(FILE *file) const
745 // Saving version info:
746 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
747 throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
749 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
750 throw std::runtime_error("TCImplementation::Save(): file write error (n).");
751 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
752 throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
754 for(ulong i = 0; i < 256; ++i)
755 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
756 throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
758 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
759 throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
761 alphabetrank->save(file);
763 suffixes->Save(file);
764 suffixDocId->Save(file);
766 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
767 throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
768 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
769 throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
777 * Load index from a file handle
779 * Throws a std::runtime_error exception on i/o error.
780 * For more info, see TCImplementation::Save().
782 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
783 : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
784 suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
787 if (std::fread(&verFlag, 1, 1, file) != 1)
788 throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
789 if (verFlag != TCImplementation::versionFlag)
790 throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
792 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
793 throw std::runtime_error("TCImplementation::Load(): file read error (n).");
794 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
795 throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
796 // FIXME samplerate can not be changed during load.
797 // if (this->samplerate == 0)
798 // this->samplerate = samplerate;
800 for(ulong i = 0; i < 256; ++i)
801 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
802 throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
804 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
805 throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
807 alphabetrank = static_sequence::load(file);
808 sampled = static_bitsequence::load(file);
809 suffixes = new BlockArray(file);
810 suffixDocId = new BlockArray(file);
812 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
813 throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
814 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
815 throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
817 Doc = static_sequence::load(file);
819 // FIXME Construct data structures with new samplerate
826 * Rest of the functions follow...
831 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
833 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
834 int c = (int)pattern[m-1];
836 TextPosition sp = C[c];
837 TextPosition ep = C[c+1]-1;
838 while (sp<=ep && i>=1)
840 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
841 c = (int)pattern[--i];
842 sp = C[c]+alphabetrank->rank(c,sp-1);
843 ep = C[c]+alphabetrank->rank(c,ep)-1;
853 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
855 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
856 int c = (int)pattern[m-1];
857 assert(c == 0); // Start from endmarkers
859 TextPosition sp = begin;
860 TextPosition ep = end;
861 while (sp<=ep && i>=1)
863 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
864 c = (int)pattern[--i];
865 sp = C[c]+alphabetrank->rank(c,sp-1);
866 ep = C[c]+alphabetrank->rank(c,ep)-1;
877 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
879 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
880 uint c = (int)pattern[m-1];
883 TextPosition ep = C[c+1]-1;
884 while (sp<=ep && i>=1)
886 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
887 c = (int)pattern[--i];
888 uint result = alphabetrank->rankLessThan(c,ep);
903 TCImplementation::~TCImplementation() {
911 void TCImplementation::makewavelet(uchar *bwt)
920 if (C[i]>0) {min = i; break;}
921 for (i=255;i>=min;--i)
922 if (C[i]>0) {max = i; break;}
924 ulong prev=C[0], temp;
926 for (i=1;i<256;i++) {
931 // this->codetable = node::makecodetable(bwt,n);
932 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
934 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
935 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
937 #ifdef DEBUG_MEMUSAGE
938 std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
939 HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
942 alphabet_mapper * am = new alphabet_mapper_none();
943 static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
944 wt_coder * wtc = new wt_coder_binary(bwt,n,am);
945 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
947 bwt = 0; // already deleted
949 #ifdef DEBUG_MEMUSAGE
950 std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
951 std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
955 void TCImplementation::maketables()
957 // Calculate BWT end-marker position (of last inserted text)
960 uint alphabetrank_i_tmp = 0;
961 uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
964 i = C[c]+alphabetrank_i_tmp-1;
965 c = alphabetrank->access(i, alphabetrank_i_tmp);
971 // Build up array for text starting positions
972 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
973 (*textStartPos)[0] = 0;
975 // Mapping from end-markers to doc ID's:
976 uint *endmarkerDocId = new uint[numberOfTexts]; // FIXME Use BlockArray with static_sequence_wvtree_noptrs.
978 uint *sampledpositions = new uint[n/(sizeof(uint)*8)+1];
979 for (ulong i = 0; i < n / (sizeof(uint)*8) + 1; i++)
980 sampledpositions[i] = 0;
983 ulong sampleCount = 0;
984 // Keeping track of text position of prev. end-marker seen
985 ulong posOfSuccEndmarker = n;
986 DocId textId = numberOfTexts;
989 uint alphabetrank_i_tmp =0;
992 * First pass: populate tables textStartPos and sampledpositions.
994 for (ulong i=n-1;i<ulongmax;i--) {
995 // i substitutes SA->GetPos(i)
998 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
999 set_field(sampledpositions,1,p,1);
1003 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1008 // Record the order of end-markers in BWT:
1009 ulong endmarkerRank = alphabetrank_i_tmp - 1;
1010 endmarkerDocId[endmarkerRank] = (textId + 1) % numberOfTexts;
1012 // Store text length and text start position:
1013 if (textId < (DocId)numberOfTexts - 1)
1015 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1016 posOfSuccEndmarker = x;
1019 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1020 p = textId; // Correct LF-mapping to the last char of the previous text.
1022 else // Now c != '\0', do LF-mapping:
1023 p = C[c]+alphabetrank_i_tmp-1;
1025 assert(textId == 0);
1027 sampled = new static_bitsequence_rrr02(sampledpositions, n, 16);
1028 delete [] sampledpositions;
1029 ulong sampleLength = sampled->rank1(n-1);
1030 assert(sampleCount == sampleLength);
1032 // Suffixes store an offset from the text start position
1033 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1034 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1037 textId = numberOfTexts;
1040 * Second pass: populate tables suffixes and suffixDocId.
1042 for (ulong i=n-1;i<ulongmax;i--) {
1045 if (sampled->access(p)) {
1046 ulong j = sampled->rank1(p)-1;
1048 (*suffixDocId)[j] = DocIdAtTextPos(textStartPos, x);
1050 // calculate offset from text start:
1051 (*suffixes)[j] = x - (*textStartPos)[(*suffixDocId)[j]];
1054 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1058 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1059 p = textId; // Correct LF-mapping to the last char of the previous text.
1061 else // Now c != '\0', do LF-mapping:
1062 p = C[c]+alphabetrank_i_tmp-1;
1064 assert(textId == 0);
1066 delete textStartPos;
1068 #ifdef DEBUG_MEMUSAGE
1069 std::cerr << "max heap usage before Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1070 HeapProfiler::ResetMaxHeapConsumption();
1073 alphabet_mapper * am = new alphabet_mapper_none();
1074 static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1075 Doc = new static_sequence_wvtree_noptrs(endmarkerDocId, numberOfTexts, bmb, am, true);
1077 // delete [] endmarkerDocId; // already deleted in static_sequence_wvtree_noptrs!
1079 #ifdef DEBUG_MEMUSAGE
1080 std::cerr << "max heap usage after Doc: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1086 * Finds document identifier for given text position
1088 * Starting text position of the document is stored into second parameter.
1089 * Binary searching on text starting positions.
1091 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1096 DocId b = numberOfTexts - 1;
1099 DocId c = a + (b - a)/2;
1100 if ((*textStartPos)[c] > i)
1102 else if ((*textStartPos)[c+1] > i)
1108 assert(a < (DocId)numberOfTexts);
1109 assert(i >= (*textStartPos)[a]);
1110 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));