1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
20 #include "TCImplementation.h"
22 //#include "HeapProfiler.h" // FIXME remove
31 #include <cstring> // For strlen()
40 // Save file version info
41 const uchar TCImplementation::versionFlag = 4;
44 * Constructor inits an empty dynamic FM-index.
45 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
47 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
48 : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
49 suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
51 makewavelet(bwt); // Deletes bwt!
54 // Make sampling tables
58 bool TCImplementation::EmptyText(DocId k) const
60 assert(k < (DocId)numberOfTexts);
61 return false; // Empty texts are not indexed
64 uchar* TCImplementation::GetText(DocId k) const
66 assert(k < (DocId)numberOfTexts);
70 // Reserve average string length to avoid reallocs
71 result.reserve(n/numberOfTexts);
73 uchar c = alphabetrank->access(i);
77 i = C[c]+alphabetrank->rank(c,i)-1;
79 c = alphabetrank->access(i); // "next" char.
82 // Convert to uchar (FIXME return string?)
84 uchar* res = new uchar[i+1];
86 for (ulong j = 0; j < i; ++j)
87 res[i-j-1] = result[j];
92 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
94 assert(k < (DocId)numberOfTexts);
95 assert(j < (*textLength)[k]);
100 // Start position of k'th text
101 ulong start = (*textStartPos)[k];
103 return Substring(i + start, j-i+1);
106 /******************************************************************
107 * Existential queries
109 bool TCImplementation::IsPrefix(uchar const * pattern) const
111 TextPosition m = strlen((char *)pattern);
115 TextPosition sp = 0, ep = 0;
116 Search(pattern, m, &sp, &ep);
118 // Check for end-marker(s) in result interval
119 if (CountEndmarkers(sp, ep))
124 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
126 TextPosition m = strlen((char *)pattern);
130 TextPosition sp = 0, ep = 0;
131 Search(pattern, m, &sp, &ep);
133 // Check for end-marker(s) in result interval
134 if (CountEndmarkers(sp, ep, begin, end))
140 bool TCImplementation::IsSuffix(uchar const *pattern) const
142 // Here counting is as fast as IsSuffix():
143 if (CountSuffix(pattern) > 0)
148 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
150 // Here counting is as fast as IsSuffix():
151 if (CountSuffix(pattern, begin, end) > 0)
156 bool TCImplementation::IsEqual(uchar const *pattern) const
158 TextPosition m = std::strlen((char *)pattern);
160 return false; // No empty texts exists
162 TextPosition sp = 0, ep = 0;
163 // Match including end-marker
164 Search(pattern, m+1, &sp, &ep);
166 // Check for end-marker(s) in result interval
167 if (CountEndmarkers(sp, ep))
172 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
174 TextPosition m = std::strlen((char *)pattern);
176 return false; // No empty texts exists
178 TextPosition sp = 0, ep = 0;
179 // Match including end-marker
180 Search(pattern, m+1, &sp, &ep, begin, end);
182 // Check for end-marker(s) in result interval
183 if (CountEndmarkers(sp, ep))
188 bool TCImplementation::IsContains(uchar const * pattern) const
190 TextPosition m = strlen((char *)pattern);
194 TextPosition sp = 0, ep = 0;
195 // Just check if pattern exists somewhere
196 ulong count = Search(pattern, m, &sp, &ep);
203 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
205 // Here counting is as fast as existential querying
206 if (CountContains(pattern, begin, end) > 0)
207 return true; // FIXME No need to filter result set
211 bool TCImplementation::IsLessThan(uchar const * pattern) const
213 if (CountLessThan(pattern) > 0)
218 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
220 if (CountLessThan(pattern, begin, end) > 0)
225 /******************************************************************
228 ulong TCImplementation::Count(uchar const * pattern) const
230 TextPosition m = strlen((char *)pattern);
234 TextPosition sp = 0, ep = 0;
235 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
239 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
241 TextPosition m = strlen((char *)pattern);
243 return numberOfTexts;
245 TextPosition sp = 0, ep = 0;
246 Search(pattern, m, &sp, &ep);
248 // Count end-markers in result interval
249 return CountEndmarkers(sp, ep);
252 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
254 TextPosition m = strlen((char *)pattern);
256 return numberOfTexts;
258 TextPosition sp = 0, ep = 0;
259 Search(pattern, m, &sp, &ep);
261 // Count end-markers in result interval
262 return CountEndmarkers(sp, ep, begin, end);
265 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
267 TextPosition m = strlen((char *)pattern);
269 return numberOfTexts;
271 TextPosition sp = 0, ep = 0;
272 // Search with end-marker
273 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
278 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
280 TextPosition m = strlen((char *)pattern);
282 return numberOfTexts;
284 TextPosition sp = 0, ep = 0;
285 // Search with end-marker
286 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
291 unsigned TCImplementation::CountEqual(uchar const *pattern) const
293 TextPosition m = strlen((char const *)pattern);
295 return 0; // No empty texts.
297 TextPosition sp = 0, ep = 0;
298 // Match including end-marker
299 Search(pattern, m+1, &sp, &ep);
301 // Count end-markers in result interval
302 return CountEndmarkers(sp, ep);
305 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
307 TextPosition m = strlen((char const *)pattern);
309 return 0; // No empty texts.
311 TextPosition sp = 0, ep = 0;
312 // Match including end-marker
313 Search(pattern, m+1, &sp, &ep, begin, end);
315 // Count end-markers in result interval
316 return CountEndmarkers(sp, ep); // Already within [begin, end]
319 unsigned TCImplementation::CountContains(uchar const * pattern) const
321 TextPosition m = strlen((char const *)pattern);
323 return numberOfTexts; // Total number of texts.
325 // Here counting is as slow as fetching the result set
326 // because we have to filter out occ's that fall within same document.
327 TextCollection::document_result result = Contains(pattern);
328 return result.size();
331 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
333 TextPosition m = strlen((char const *)pattern);
335 return numberOfTexts; // Total number of texts.
337 // Here counting is as slow as fetching the result set
338 // because we have to filter out occ's that fall within same document.
339 TextCollection::document_result result = Contains(pattern, begin, end);
340 return result.size();
343 // Less than or equal
344 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
346 TextPosition m = strlen((char const *)pattern);
348 return 0; // No empty texts.
350 TextPosition sp = 0, ep = 0;
351 SearchLessThan(pattern, m, &sp, &ep);
353 // Count end-markers in result interval
354 return CountEndmarkers(sp, ep);
357 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
359 TextPosition m = strlen((char const *)pattern);
361 return 0; // No empty texts.
363 TextPosition sp = 0, ep = 0;
364 SearchLessThan(pattern, m, &sp, &ep);
366 // Count end-markers in result interval
367 return CountEndmarkers(sp, ep, begin, end);
371 * Document reporting queries
373 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
375 TextPosition m = strlen((char *)pattern);
377 return TextCollection::document_result(); // FIXME Should return all 1...k
379 TextPosition sp = 0, ep = 0;
380 Search(pattern, m, &sp, &ep);
382 // Iterate through end-markers in [sp,ep]:
383 return EnumerateEndmarkers(sp, ep);
386 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
388 TextPosition m = strlen((char *)pattern);
390 return TextCollection::document_result(); // FIXME Should return all 1...k
392 TextPosition sp = 0, ep = 0;
393 Search(pattern, m, &sp, &ep);
395 // Return end-markers in [sp,ep] and [begin, end]:
396 return EnumerateEndmarkers(sp, ep, begin, end);
399 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
401 TextPosition m = strlen((char *)pattern);
403 return TextCollection::document_result(); // FIXME Should return all 1...k
405 TextPosition sp = 0, ep = 0;
406 // Search with end-marker
407 Search(pattern, m+1, &sp, &ep);
409 TextCollection::document_result result;
410 result.reserve(ep-sp+1); // Try to avoid reallocation.
412 ulong sampled_rank_i = 0;
413 // Check each occurrence
414 for (; sp <= ep; ++sp)
418 uchar c = alphabetrank->access(i);
419 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
421 i = C[c]+alphabetrank->rank(c,i)-1;
422 c = alphabetrank->access(i);
424 // Assert: c == '\0' OR sampled->IsBitSet(i)
428 // Rank among the end-markers in BWT
429 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
430 result.push_back(Doc->access(endmarkerRank));
432 else // Sampled position
434 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
435 result.push_back(docId);
442 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
444 TextPosition m = strlen((char *)pattern);
446 return TextCollection::document_result(); // FIXME Should return all 1...k
448 TextPosition sp = 0, ep = 0;
449 // Search with end-marker
450 Search(pattern, m+1, &sp, &ep, begin, end);
452 TextCollection::document_result result;
453 result.reserve(ep-sp+1); // Try to avoid reallocation.
455 ulong sampled_rank_i = 0;
456 // Check each occurrence, already within [begin, end]
457 for (; sp <= ep; ++sp)
461 uchar c = alphabetrank->access(i);
462 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
464 i = C[c]+alphabetrank->rank(c,i)-1;
465 c = alphabetrank->access(i);
467 // Assert: c == '\0' OR sampled->IsBitSet(i)
471 // Rank among the end-markers in BWT
472 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
473 result.push_back(Doc->access(endmarkerRank));
475 else // Sampled position
477 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
478 result.push_back(docId);
486 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
488 TextPosition m = strlen((char const *)pattern);
490 return TextCollection::document_result(); // FIXME Should return all empty texts
492 TextPosition sp = 0, ep = 0;
493 // Match including end-marker
494 Search(pattern, m+1, &sp, &ep);
496 // Report end-markers in result interval
497 return EnumerateEndmarkers(sp, ep);
500 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
502 TextPosition m = strlen((char const *)pattern);
504 return TextCollection::document_result(); // FIXME Should return all empty texts
506 TextPosition sp = 0, ep = 0;
507 // Match including end-marker
508 Search(pattern, m+1, &sp, &ep, begin, end);
510 // Report end-markers in result interval
511 return EnumerateEndmarkers(sp, ep, begin, end);
515 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
517 TextPosition m = strlen((char *)pattern);
519 return TextCollection::document_result();
521 TextPosition sp = 0, ep = 0;
522 // Search all occurrences
523 Search(pattern, m, &sp, &ep);
525 // We want unique document indentifiers, using std::set to collect them
526 std::set<DocId> resultSet;
528 ulong sampled_rank_i = 0;
529 // Check each occurrence
530 for (; sp <= ep; ++sp)
533 uchar c = alphabetrank->access(i);
534 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
536 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
537 c = alphabetrank->access(i);
541 // Rank among the end-markers in BWT
542 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
543 resultSet.insert(Doc->access(endmarkerRank));
547 DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
548 assert((unsigned)di < numberOfTexts);
549 resultSet.insert(di);
553 // Convert std::set to std::vector
554 TextCollection::document_result result(resultSet.begin(), resultSet.end());
558 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
560 TextPosition m = strlen((char *)pattern);
562 return TextCollection::document_result();
564 TextPosition sp = 0, ep = 0;
565 // Search all occurrences
566 Search(pattern, m, &sp, &ep);
568 // We want unique document indentifiers, using std::set to collect them
569 std::set<DocId> resultSet;
571 ulong sampled_rank_i = 0;
572 // Check each occurrence
573 for (; sp <= ep; ++sp)
576 uchar c = alphabetrank->access(i);
577 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
579 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
580 c = alphabetrank->access(i);
584 // Rank among the end-markers in BWT
585 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
586 DocId docId = Doc->access(endmarkerRank);
587 if (docId >= begin && docId <= end)
588 resultSet.insert(docId);
592 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
593 assert((unsigned)docId < numberOfTexts);
594 if (docId >= begin && docId <= end)
595 resultSet.insert(docId);
599 // Convert std::set to std::vector
600 TextCollection::document_result result(resultSet.begin(), resultSet.end());
604 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
606 TextPosition m = strlen((char *)pattern);
608 return TextCollection::document_result(); // empty result set
610 TextPosition sp = 0, ep = 0;
611 SearchLessThan(pattern, m, &sp, &ep);
613 // Report end-markers in result interval
614 return EnumerateEndmarkers(sp, ep);
617 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
619 TextPosition m = strlen((char *)pattern);
621 return TextCollection::document_result(); // empty result set
623 TextPosition sp = 0, ep = 0;
624 SearchLessThan(pattern, m, &sp, &ep);
626 // Iterate through end-markers in [sp,ep] and [begin, end]:
627 return EnumerateEndmarkers(sp, ep, begin, end);
631 * Full result set queries
633 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
635 TextPosition m = strlen((char *)pattern);
637 return full_result(); // FIXME Throw exception?
639 TextPosition sp = 0, ep = 0;
640 // Search all occurrences
641 Search(pattern, m, &sp, &ep);
644 result.reserve(ep-sp+1); // Try to avoid reallocation.
646 ulong sampled_rank_i = 0;
647 // Report each occurrence
648 for (; sp <= ep; ++sp)
651 TextPosition dist = 0;
652 uchar c = alphabetrank->access(i);
653 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
655 i = C[c]+alphabetrank->rank(c,i)-1;
656 c = alphabetrank->access(i);
661 // Rank among the end-markers in BWT
662 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
663 DocId docId = Doc->access(endmarkerRank);
664 result.push_back(make_pair(docId, dist));
668 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
669 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
670 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
672 result.push_back(make_pair(docId, textPos));
679 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
681 TextPosition m = strlen((char *)pattern);
683 return full_result(); // FIXME Throw exception?
685 TextPosition sp = 0, ep = 0;
686 // Search all occurrences
687 Search(pattern, m, &sp, &ep);
690 result.reserve(ep-sp+1); // Try to avoid reallocation.
692 ulong sampled_rank_i = 0;
693 // Report each occurrence
694 for (; sp <= ep; ++sp)
697 TextPosition dist = 0;
698 uchar c = alphabetrank->access(i);
699 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
701 i = C[c]+alphabetrank->rank(c,i)-1;
702 c = alphabetrank->access(i);
707 // Rank among the end-markers in BWT
708 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
710 // End-marker that we found belongs to the "preceeding" doc in collection:
711 DocId docId = Doc->access(endmarkerRank);
712 if (docId >= begin && docId <= end)
713 result.push_back(make_pair(docId, dist));
717 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
718 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
719 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
721 if (docId >= begin && docId <= end)
722 result.push_back(make_pair(docId, textPos));
731 * Save index to a file handle
733 * Throws a std::runtime_error exception on i/o error.
734 * First byte that is saved represents the version number of the save file.
735 * In version 2 files, the data fields are:
740 TextPosition bwtEndPos;
741 static_sequence * alphabetrank;
743 BlockArray *suffixes;
744 BlockArray *suffixDocId;
745 unsigned numberOfTexts;
747 static_sequence *docId;
749 void TCImplementation::Save(FILE *file) const
751 // Saving version info:
752 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
753 throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
755 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
756 throw std::runtime_error("TCImplementation::Save(): file write error (n).");
757 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
758 throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
760 for(ulong i = 0; i < 256; ++i)
761 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
762 throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
764 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
765 throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
767 alphabetrank->save(file);
769 suffixes->Save(file);
770 suffixDocId->Save(file);
772 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
773 throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
774 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
775 throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
783 * Load index from a file handle
785 * Throws a std::runtime_error exception on i/o error.
786 * For more info, see TCImplementation::Save().
788 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
789 : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
790 suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
793 if (std::fread(&verFlag, 1, 1, file) != 1)
794 throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
795 if (verFlag != TCImplementation::versionFlag)
796 throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
798 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
799 throw std::runtime_error("TCImplementation::Load(): file read error (n).");
800 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
801 throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
802 // FIXME samplerate can not be changed during load.
803 // if (this->samplerate == 0)
804 // this->samplerate = samplerate;
806 for(ulong i = 0; i < 256; ++i)
807 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
808 throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
810 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
811 throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
813 alphabetrank = static_sequence::load(file);
814 sampled = new BSGAP(file);
815 suffixes = new BlockArray(file);
816 suffixDocId = new BlockArray(file);
818 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
819 throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
820 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
821 throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
823 Doc = static_sequence::load(file);
825 // FIXME Construct data structures with new samplerate
832 * Rest of the functions follow...
837 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
839 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
840 int c = (int)pattern[m-1];
842 TextPosition sp = C[c];
843 TextPosition ep = C[c+1]-1;
844 while (sp<=ep && i>=1)
846 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
847 c = (int)pattern[--i];
848 sp = C[c]+alphabetrank->rank(c,sp-1);
849 ep = C[c]+alphabetrank->rank(c,ep)-1;
859 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
861 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
862 int c = (int)pattern[m-1];
863 assert(c == 0); // Start from endmarkers
865 TextPosition sp = begin;
866 TextPosition ep = end;
867 while (sp<=ep && i>=1)
869 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
870 c = (int)pattern[--i];
871 sp = C[c]+alphabetrank->rank(c,sp-1);
872 ep = C[c]+alphabetrank->rank(c,ep)-1;
883 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
885 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
886 uint c = (int)pattern[m-1];
889 TextPosition ep = C[c+1]-1;
890 while (sp<=ep && i>=1)
892 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
893 c = (int)pattern[--i];
894 uint result = alphabetrank->rankLessThan(c,ep);
909 TCImplementation::~TCImplementation() {
917 void TCImplementation::makewavelet(uchar *bwt)
926 if (C[i]>0) {min = i; break;}
927 for (i=255;i>=min;--i)
928 if (C[i]>0) {max = i; break;}
930 ulong prev=C[0], temp;
932 for (i=1;i<256;i++) {
937 // this->codetable = node::makecodetable(bwt,n);
938 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
940 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
941 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
943 // std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
944 // HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
946 alphabet_mapper * am = new alphabet_mapper_none();
947 static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
948 wt_coder * wtc = new wt_coder_binary(bwt,n,am);
949 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
951 bwt = 0; // already deleted
953 // std::cerr << "heap usage after WT: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
954 // std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
957 void TCImplementation::maketables()
959 // Calculate BWT end-marker position (of last inserted text)
962 uint alphabetrank_i_tmp = 0;
963 uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
966 i = C[c]+alphabetrank_i_tmp-1;
967 c = alphabetrank->access(i, alphabetrank_i_tmp);
973 // Build up arrays for text length and starting positions
974 // FIXME Temp, remove
975 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
976 (*textStartPos)[0] = 0;
979 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
980 unsigned ceilLog2n = Tools::CeilLog2(n);
981 BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
982 BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
984 // Mapping from end-markers to doc ID's:
985 BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
987 ulong *sampledpositions = new ulong[n/W+1];
988 for (ulong i=0;i<n/W+1;i++)
989 sampledpositions[i]=0lu;
992 ulong sampleCount = 0;
993 // Keeping track of text position of prev. end-marker seen
994 ulong posOfSuccEndmarker = n;
995 DocId textId = numberOfTexts;
998 uint alphabetrank_i_tmp =0;
1000 for (ulong i=n-1;i<ulongmax;i--) {
1001 // i substitutes SA->GetPos(i)
1004 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1005 Tools::SetField(sampledpositions,1,p,1);
1006 (*positions)[sampleCount] = p;
1007 (*tmpSuffix)[sampleCount] = x; // FIXME remove
1011 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1016 // Record the order of end-markers in BWT:
1017 ulong endmarkerRank = alphabetrank_i_tmp - 1;
1018 (*endmarkerDocId)[endmarkerRank] = textId;
1020 // Store text length and text start position:
1021 if (textId < (DocId)numberOfTexts - 1)
1023 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1024 posOfSuccEndmarker = x;
1027 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1028 p = textId; // Correct LF-mapping to the last char of the previous text.
1030 else // Now c != '\0', do LF-mapping:
1031 p = C[c]+alphabetrank_i_tmp-1;
1033 assert(textId == 0);
1035 sampled = new BSGAP(sampledpositions,n,true);
1036 sampleLength = sampled->rank(n-1);
1037 assert(sampleCount == sampleLength);
1039 // Suffixes store an offset from the text start position
1040 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1041 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1043 for(ulong i=0; i<sampleLength; i++) {
1044 assert((*positions)[i] < n);
1045 ulong j = sampled->rank((*positions)[i]);
1046 if (j==0) j=sampleLength;
1047 TextPosition textPos = (*tmpSuffix)[i];
1048 (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1050 assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1051 assert((*suffixDocId)[j-1] < numberOfTexts);
1052 // calculate offset from text start:
1053 (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1055 // FIXME Temp, remove
1058 delete textStartPos;
1060 // std::cerr << "heap usage before Doc: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1063 uint *tmp = new uint[numberOfTexts]; // FIXME Silly...
1064 for (unsigned i = 0; i < numberOfTexts; ++i)
1066 tmp[i] = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1067 // cout << tmp[i] << ",";
1070 delete endmarkerDocId;
1072 alphabet_mapper * am = new alphabet_mapper_none();
1073 static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1074 Doc = new static_sequence_wvtree_noptrs(tmp, numberOfTexts, bmb, am);
1078 // std::cerr << "heap usage after Doc: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1080 /*document_result res = Doc->access(3, 3, 0, 3);
1081 cout << "Access result: ";
1082 for (document_result::iterator it = res.begin(); it != res.end(); ++it)
1083 cout << *it << ", ";
1090 * Finds document identifier for given text position
1092 * Starting text position of the document is stored into second parameter.
1093 * Binary searching on text starting positions.
1095 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1100 DocId b = numberOfTexts - 1;
1103 DocId c = a + (b - a)/2;
1104 if ((*textStartPos)[c] > i)
1106 else if ((*textStartPos)[c+1] > i)
1112 assert(a < (DocId)numberOfTexts);
1113 assert(i >= (*textStartPos)[a]);
1114 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));