1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
20 #include "TCImplementation.h"
22 //#include "HeapProfiler.h" // FIXME remove
31 #include <cstring> // For strlen()
40 // Save file version info
41 const uchar TCImplementation::versionFlag = 4;
44 * Constructor inits an empty dynamic FM-index.
45 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
47 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
48 : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
49 suffixDocId(0), numberOfTexts(numberOfTexts_), maxTextLength(maxTextLength_), Doc(0)
51 makewavelet(bwt); // Deletes bwt!
54 // Make sampling tables
58 bool TCImplementation::EmptyText(DocId k) const
60 assert(k < (DocId)numberOfTexts);
61 return false; // Empty texts are not indexed
64 uchar* TCImplementation::GetText(DocId k) const
66 assert(k < (DocId)numberOfTexts);
70 // Reserve average string length to avoid reallocs
71 result.reserve(n/numberOfTexts);
73 uchar c = alphabetrank->access(i);
77 i = C[c]+alphabetrank->rank(c,i)-1;
79 c = alphabetrank->access(i); // "next" char.
82 // Convert to uchar (FIXME return string?)
84 uchar* res = new uchar[i+1];
86 for (ulong j = 0; j < i; ++j)
87 res[i-j-1] = result[j];
92 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
94 assert(k < (DocId)numberOfTexts);
95 assert(j < (*textLength)[k]);
100 // Start position of k'th text
101 ulong start = (*textStartPos)[k];
103 return Substring(i + start, j-i+1);
106 /******************************************************************
107 * Existential queries
109 bool TCImplementation::IsPrefix(uchar const * pattern) const
111 TextPosition m = strlen((char *)pattern);
115 TextPosition sp = 0, ep = 0;
116 Search(pattern, m, &sp, &ep);
118 // Check for end-marker(s) in result interval
119 if (CountEndmarkers(sp, ep))
124 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
126 TextPosition m = strlen((char *)pattern);
130 TextPosition sp = 0, ep = 0;
131 Search(pattern, m, &sp, &ep);
133 // Check for end-marker(s) in result interval
134 if (CountEndmarkers(sp, ep, begin, end))
140 bool TCImplementation::IsSuffix(uchar const *pattern) const
142 // Here counting is as fast as IsSuffix():
143 if (CountSuffix(pattern) > 0)
148 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
150 // Here counting is as fast as IsSuffix():
151 if (CountSuffix(pattern, begin, end) > 0)
156 bool TCImplementation::IsEqual(uchar const *pattern) const
158 TextPosition m = std::strlen((char *)pattern);
160 return false; // No empty texts exists
162 TextPosition sp = 0, ep = 0;
163 // Match including end-marker
164 Search(pattern, m+1, &sp, &ep);
166 // Check for end-marker(s) in result interval
167 if (CountEndmarkers(sp, ep))
172 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
174 TextPosition m = std::strlen((char *)pattern);
176 return false; // No empty texts exists
178 TextPosition sp = 0, ep = 0;
179 // Match including end-marker
180 Search(pattern, m+1, &sp, &ep, begin, end);
182 // Check for end-marker(s) in result interval
183 if (CountEndmarkers(sp, ep))
188 bool TCImplementation::IsContains(uchar const * pattern) const
190 TextPosition m = strlen((char *)pattern);
194 TextPosition sp = 0, ep = 0;
195 // Just check if pattern exists somewhere
196 ulong count = Search(pattern, m, &sp, &ep);
203 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
205 // Here counting is as fast as existential querying
206 if (CountContains(pattern, begin, end) > 0)
207 return true; // FIXME No need to filter result set
211 bool TCImplementation::IsLessThan(uchar const * pattern) const
213 if (CountLessThan(pattern) > 0)
218 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
220 if (CountLessThan(pattern, begin, end) > 0)
225 /******************************************************************
228 ulong TCImplementation::Count(uchar const * pattern) const
230 TextPosition m = strlen((char *)pattern);
234 TextPosition sp = 0, ep = 0;
235 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
239 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
241 TextPosition m = strlen((char *)pattern);
243 return numberOfTexts;
245 TextPosition sp = 0, ep = 0;
246 Search(pattern, m, &sp, &ep);
248 // Count end-markers in result interval
249 return CountEndmarkers(sp, ep);
252 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
254 TextPosition m = strlen((char *)pattern);
256 return numberOfTexts;
258 TextPosition sp = 0, ep = 0;
259 Search(pattern, m, &sp, &ep);
261 // Count end-markers in result interval
262 return CountEndmarkers(sp, ep, begin, end);
265 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
267 TextPosition m = strlen((char *)pattern);
269 return numberOfTexts;
271 TextPosition sp = 0, ep = 0;
272 // Search with end-marker
273 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
278 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
280 TextPosition m = strlen((char *)pattern);
282 return numberOfTexts;
284 TextPosition sp = 0, ep = 0;
285 // Search with end-marker
286 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
291 unsigned TCImplementation::CountEqual(uchar const *pattern) const
293 TextPosition m = strlen((char const *)pattern);
295 return 0; // No empty texts.
297 TextPosition sp = 0, ep = 0;
298 // Match including end-marker
299 Search(pattern, m+1, &sp, &ep);
301 // Count end-markers in result interval
302 return CountEndmarkers(sp, ep);
305 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
307 TextPosition m = strlen((char const *)pattern);
309 return 0; // No empty texts.
311 TextPosition sp = 0, ep = 0;
312 // Match including end-marker
313 Search(pattern, m+1, &sp, &ep, begin, end);
315 // Count end-markers in result interval
316 return CountEndmarkers(sp, ep); // Already within [begin, end]
319 unsigned TCImplementation::CountContains(uchar const * pattern) const
321 TextPosition m = strlen((char const *)pattern);
323 return numberOfTexts; // Total number of texts.
325 // Here counting is as slow as fetching the result set
326 // because we have to filter out occ's that fall within same document.
327 TextCollection::document_result result = Contains(pattern);
328 return result.size();
331 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
333 TextPosition m = strlen((char const *)pattern);
335 return numberOfTexts; // Total number of texts.
337 // Here counting is as slow as fetching the result set
338 // because we have to filter out occ's that fall within same document.
339 TextCollection::document_result result = Contains(pattern, begin, end);
340 return result.size();
343 // Less than or equal
344 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
346 TextPosition m = strlen((char const *)pattern);
348 return 0; // No empty texts.
350 TextPosition sp = 0, ep = 0;
351 SearchLessThan(pattern, m, &sp, &ep);
353 // Count end-markers in result interval
354 return CountEndmarkers(sp, ep);
357 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
359 TextPosition m = strlen((char const *)pattern);
361 return 0; // No empty texts.
363 TextPosition sp = 0, ep = 0;
364 SearchLessThan(pattern, m, &sp, &ep);
366 // Count end-markers in result interval
367 return CountEndmarkers(sp, ep, begin, end);
371 * Document reporting queries
373 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
375 TextPosition m = strlen((char *)pattern);
377 return TextCollection::document_result(); // FIXME Should return all 1...k
379 TextPosition sp = 0, ep = 0;
380 Search(pattern, m, &sp, &ep);
382 TextCollection::document_result result;
384 // Report end-markers in result interval
385 unsigned resultSize = CountEndmarkers(sp, ep);
389 result.reserve(resultSize); // Try to avoid reallocation.
391 // Iterate through end-markers in [sp,ep]:
392 return EnumerateEndmarkers(sp, ep);
395 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
397 TextPosition m = strlen((char *)pattern);
399 return TextCollection::document_result(); // FIXME Should return all 1...k
401 TextPosition sp = 0, ep = 0;
402 Search(pattern, m, &sp, &ep);
404 TextCollection::document_result result;
406 // Report end-markers in result interval
407 unsigned resultSize = CountEndmarkers(sp, ep);
411 result.reserve(resultSize); // Try to avoid reallocation.
413 // Return end-markers in [sp,ep] and [begin, end]:
414 return EnumerateEndmarkers(sp, ep, begin, end);
417 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
419 TextPosition m = strlen((char *)pattern);
421 return TextCollection::document_result(); // FIXME Should return all 1...k
423 TextPosition sp = 0, ep = 0;
424 // Search with end-marker
425 Search(pattern, m+1, &sp, &ep);
427 TextCollection::document_result result;
428 result.reserve(ep-sp+1); // Try to avoid reallocation.
430 ulong sampled_rank_i = 0;
431 // Check each occurrence
432 for (; sp <= ep; ++sp)
436 uchar c = alphabetrank->access(i);
437 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
439 i = C[c]+alphabetrank->rank(c,i)-1;
440 c = alphabetrank->access(i);
442 // Assert: c == '\0' OR sampled->IsBitSet(i)
446 // Rank among the end-markers in BWT
447 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
448 result.push_back(Doc->access(endmarkerRank));
450 else // Sampled position
452 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
453 result.push_back(docId);
460 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
462 TextPosition m = strlen((char *)pattern);
464 return TextCollection::document_result(); // FIXME Should return all 1...k
466 TextPosition sp = 0, ep = 0;
467 // Search with end-marker
468 Search(pattern, m+1, &sp, &ep, begin, end);
470 TextCollection::document_result result;
471 result.reserve(ep-sp+1); // Try to avoid reallocation.
473 ulong sampled_rank_i = 0;
474 // Check each occurrence, already within [begin, end]
475 for (; sp <= ep; ++sp)
479 uchar c = alphabetrank->access(i);
480 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
482 i = C[c]+alphabetrank->rank(c,i)-1;
483 c = alphabetrank->access(i);
485 // Assert: c == '\0' OR sampled->IsBitSet(i)
489 // Rank among the end-markers in BWT
490 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
491 result.push_back(Doc->access(endmarkerRank));
493 else // Sampled position
495 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
496 result.push_back(docId);
504 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
506 TextPosition m = strlen((char const *)pattern);
508 return TextCollection::document_result(); // FIXME Should return all empty texts
510 TextPosition sp = 0, ep = 0;
511 // Match including end-marker
512 Search(pattern, m+1, &sp, &ep);
514 TextCollection::document_result result;
516 // Report end-markers in result interval
517 return EnumerateEndmarkers(sp, ep);
520 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
522 TextPosition m = strlen((char const *)pattern);
524 return TextCollection::document_result(); // FIXME Should return all empty texts
526 TextPosition sp = 0, ep = 0;
527 // Match including end-marker
528 Search(pattern, m+1, &sp, &ep, begin, end);
530 TextCollection::document_result result;
532 // Report end-markers in result interval
533 return EnumerateEndmarkers(sp, ep, begin, end);
537 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
539 TextPosition m = strlen((char *)pattern);
541 return TextCollection::document_result();
543 TextPosition sp = 0, ep = 0;
544 // Search all occurrences
545 Search(pattern, m, &sp, &ep);
547 // We want unique document indentifiers, using std::set to collect them
548 std::set<DocId> resultSet;
550 ulong sampled_rank_i = 0;
551 // Check each occurrence
552 for (; sp <= ep; ++sp)
555 uchar c = alphabetrank->access(i);
556 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
558 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
559 c = alphabetrank->access(i);
563 // Rank among the end-markers in BWT
564 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
565 resultSet.insert(Doc->access(endmarkerRank));
569 DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
570 assert((unsigned)di < numberOfTexts);
571 resultSet.insert(di);
575 // Convert std::set to std::vector
576 TextCollection::document_result result(resultSet.begin(), resultSet.end());
580 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
582 TextPosition m = strlen((char *)pattern);
584 return TextCollection::document_result();
586 TextPosition sp = 0, ep = 0;
587 // Search all occurrences
588 Search(pattern, m, &sp, &ep);
590 // We want unique document indentifiers, using std::set to collect them
591 std::set<DocId> resultSet;
593 ulong sampled_rank_i = 0;
594 // Check each occurrence
595 for (; sp <= ep; ++sp)
598 uchar c = alphabetrank->access(i);
599 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
601 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
602 c = alphabetrank->access(i);
606 // Rank among the end-markers in BWT
607 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
608 DocId docId = Doc->access(endmarkerRank);
609 if (docId >= begin && docId <= end)
610 resultSet.insert(docId);
614 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
615 assert((unsigned)docId < numberOfTexts);
616 if (docId >= begin && docId <= end)
617 resultSet.insert(docId);
621 // Convert std::set to std::vector
622 TextCollection::document_result result(resultSet.begin(), resultSet.end());
626 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
628 TextPosition m = strlen((char *)pattern);
630 return TextCollection::document_result(); // empty result set
632 TextPosition sp = 0, ep = 0;
633 SearchLessThan(pattern, m, &sp, &ep);
635 TextCollection::document_result result;
637 // Report end-markers in result interval
638 return EnumerateEndmarkers(sp, ep);
641 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
643 TextPosition m = strlen((char *)pattern);
645 return TextCollection::document_result(); // empty result set
647 TextPosition sp = 0, ep = 0;
648 SearchLessThan(pattern, m, &sp, &ep);
650 TextCollection::document_result result;
652 // Report end-markers in result interval
653 unsigned resultSize = CountEndmarkers(sp, ep);
657 result.reserve(resultSize); // Try to avoid reallocation.
659 // Iterate through end-markers in [sp,ep] and [begin, end]:
660 return EnumerateEndmarkers(sp, ep, begin, end);
664 * Full result set queries
666 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
668 TextPosition m = strlen((char *)pattern);
670 return full_result(); // FIXME Throw exception?
672 TextPosition sp = 0, ep = 0;
673 // Search all occurrences
674 Search(pattern, m, &sp, &ep);
677 result.reserve(ep-sp+1); // Try to avoid reallocation.
679 ulong sampled_rank_i = 0;
680 // Report each occurrence
681 for (; sp <= ep; ++sp)
684 TextPosition dist = 0;
685 uchar c = alphabetrank->access(i);
686 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
688 i = C[c]+alphabetrank->rank(c,i)-1;
689 c = alphabetrank->access(i);
694 // Rank among the end-markers in BWT
695 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
696 DocId docId = Doc->access(endmarkerRank);
697 result.push_back(make_pair(docId, dist));
701 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
702 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
703 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
705 result.push_back(make_pair(docId, textPos));
712 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
714 TextPosition m = strlen((char *)pattern);
716 return full_result(); // FIXME Throw exception?
718 TextPosition sp = 0, ep = 0;
719 // Search all occurrences
720 Search(pattern, m, &sp, &ep);
723 result.reserve(ep-sp+1); // Try to avoid reallocation.
725 ulong sampled_rank_i = 0;
726 // Report each occurrence
727 for (; sp <= ep; ++sp)
730 TextPosition dist = 0;
731 uchar c = alphabetrank->access(i);
732 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
734 i = C[c]+alphabetrank->rank(c,i)-1;
735 c = alphabetrank->access(i);
740 // Rank among the end-markers in BWT
741 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
743 // End-marker that we found belongs to the "preceeding" doc in collection:
744 DocId docId = Doc->access(endmarkerRank);
745 if (docId >= begin && docId <= end)
746 result.push_back(make_pair(docId, dist));
750 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
751 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
752 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
754 if (docId >= begin && docId <= end)
755 result.push_back(make_pair(docId, textPos));
764 * Save index to a file handle
766 * Throws a std::runtime_error exception on i/o error.
767 * First byte that is saved represents the version number of the save file.
768 * In version 2 files, the data fields are:
773 TextPosition bwtEndPos;
774 static_sequence * alphabetrank;
776 BlockArray *suffixes;
777 BlockArray *suffixDocId;
778 unsigned numberOfTexts;
780 static_sequence *docId;
782 void TCImplementation::Save(FILE *file) const
784 // Saving version info:
785 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
786 throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
788 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
789 throw std::runtime_error("TCImplementation::Save(): file write error (n).");
790 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
791 throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
793 for(ulong i = 0; i < 256; ++i)
794 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
795 throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
797 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
798 throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
800 alphabetrank->save(file);
802 suffixes->Save(file);
803 suffixDocId->Save(file);
805 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
806 throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
807 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
808 throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
816 * Load index from a file handle
818 * Throws a std::runtime_error exception on i/o error.
819 * For more info, see TCImplementation::Save().
821 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
822 : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
823 suffixDocId(0), numberOfTexts(0), maxTextLength(0), Doc(0)
826 if (std::fread(&verFlag, 1, 1, file) != 1)
827 throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
828 if (verFlag != TCImplementation::versionFlag)
829 throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
831 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
832 throw std::runtime_error("TCImplementation::Load(): file read error (n).");
833 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
834 throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
835 // FIXME samplerate can not be changed during load.
836 // if (this->samplerate == 0)
837 // this->samplerate = samplerate;
839 for(ulong i = 0; i < 256; ++i)
840 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
841 throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
843 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
844 throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
846 alphabetrank = static_sequence::load(file);
847 sampled = new BSGAP(file);
848 suffixes = new BlockArray(file);
849 suffixDocId = new BlockArray(file);
851 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
852 throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
853 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
854 throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
856 Doc = static_sequence::load(file);
858 // FIXME Construct data structures with new samplerate
865 * Rest of the functions follow...
870 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
872 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
873 int c = (int)pattern[m-1];
875 TextPosition sp = C[c];
876 TextPosition ep = C[c+1]-1;
877 while (sp<=ep && i>=1)
879 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
880 c = (int)pattern[--i];
881 sp = C[c]+alphabetrank->rank(c,sp-1);
882 ep = C[c]+alphabetrank->rank(c,ep)-1;
892 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
894 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
895 int c = (int)pattern[m-1];
896 assert(c == 0); // Start from endmarkers
898 TextPosition sp = begin;
899 TextPosition ep = end;
900 while (sp<=ep && i>=1)
902 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
903 c = (int)pattern[--i];
904 sp = C[c]+alphabetrank->rank(c,sp-1);
905 ep = C[c]+alphabetrank->rank(c,ep)-1;
916 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
918 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
919 uint c = (int)pattern[m-1];
922 TextPosition ep = C[c+1]-1;
923 while (sp<=ep && i>=1)
925 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
926 c = (int)pattern[--i];
927 uint result = alphabetrank->rank(c,ep);
942 TCImplementation::~TCImplementation() {
950 void TCImplementation::makewavelet(uchar *bwt)
959 if (C[i]>0) {min = i; break;}
960 for (i=255;i>=min;--i)
961 if (C[i]>0) {max = i; break;}
963 ulong prev=C[0], temp;
965 for (i=1;i<256;i++) {
970 // this->codetable = node::makecodetable(bwt,n);
971 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
973 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
974 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
975 // std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
977 // HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
979 alphabet_mapper * am = new alphabet_mapper_none();
980 static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
981 // static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
982 wt_coder * wtc = new wt_coder_binary(bwt,n,am);
983 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
985 bwt = 0; // already deleted
987 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
988 // std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
991 void TCImplementation::maketables()
993 // Calculate BWT end-marker position (of last inserted text)
996 uint alphabetrank_i_tmp = 0;
997 uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
1000 i = C[c]+alphabetrank_i_tmp-1;
1001 c = alphabetrank->access(i, alphabetrank_i_tmp);
1004 this->bwtEndPos = i;
1007 // Build up arrays for text length and starting positions
1008 // FIXME Temp, remove
1009 //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
1010 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1011 //(*textLength)[0] = l;
1012 (*textStartPos)[0] = 0;
1014 // Construct samples
1015 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1016 unsigned ceilLog2n = Tools::CeilLog2(n);
1017 BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
1018 BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1020 // Mapping from end-markers to doc ID's:
1021 BlockArray *endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1023 ulong *sampledpositions = new ulong[n/W+1];
1024 for (ulong i=0;i<n/W+1;i++)
1025 sampledpositions[i]=0lu;
1027 ulong x,p=bwtEndPos;
1028 ulong sampleCount = 0;
1029 // Keeping track of text position of prev. end-marker seen
1030 ulong posOfSuccEndmarker = n;
1031 DocId textId = numberOfTexts;
1034 uint alphabetrank_i_tmp =0;
1037 for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1038 // i substitutes SA->GetPos(i)
1041 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1042 Tools::SetField(sampledpositions,1,p,1);
1043 (*positions)[sampleCount] = p;
1044 (*tmpSuffix)[sampleCount] = x; // FIXME remove
1048 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1053 // Record the order of end-markers in BWT:
1054 ulong endmarkerRank = alphabetrank_i_tmp - 1;
1055 (*endmarkerDocId)[endmarkerRank] = textId;
1057 // Store text length and text start position:
1058 if (textId < (DocId)numberOfTexts - 1)
1060 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1061 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1062 posOfSuccEndmarker = x;
1065 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1066 p = textId; // Correct LF-mapping to the last char of the previous text.
1069 p = C[c]+alphabetrank_i_tmp-1;
1071 assert(textId == 0);
1073 sampled = new BSGAP(sampledpositions,n,true);
1074 sampleLength = sampled->rank(n-1);
1075 assert(sampleCount == sampleLength);
1077 // Suffixes store an offset from the text start position
1078 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1079 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1081 for(ulong i=0; i<sampleLength; i++) {
1082 assert((*positions)[i] < n);
1083 ulong j = sampled->rank((*positions)[i]);
1084 if (j==0) j=sampleLength;
1085 TextPosition textPos = (*tmpSuffix)[i];
1086 (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1088 assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1089 assert((*suffixDocId)[j-1] < numberOfTexts);
1090 // calculate offset from text start:
1091 (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1093 // FIXME Temp, remove
1096 // delete textLength;
1097 delete textStartPos;
1100 uint *tmp = new uint[numberOfTexts]; // FIXME Silly...
1102 for (unsigned i = 0; i < numberOfTexts; ++i)
1104 tmp[i] = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1105 // cout << tmp[i] << ", ";
1108 delete endmarkerDocId;
1109 alphabet_mapper * am = new alphabet_mapper_none();
1110 static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1111 wt_coder * wtc = new wt_coder_binary(tmp, numberOfTexts, am);
1112 Doc = new static_sequence_wvtree(tmp, numberOfTexts, wtc, bmb, am);
1116 /* document_result res = Doc->access(1, 2, 0, 1);
1118 for (document_result::iterator it = res.begin(); it != res.end(); ++it)
1119 cout << *it << ", ";
1125 * Finds document identifier for given text position
1127 * Starting text position of the document is stored into second parameter.
1128 * Binary searching on text starting positions.
1130 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1135 DocId b = numberOfTexts - 1;
1138 DocId c = a + (b - a)/2;
1139 if ((*textStartPos)[c] > i)
1141 else if ((*textStartPos)[c+1] > i)
1147 assert(a < (DocId)numberOfTexts);
1148 assert(i >= (*textStartPos)[a]);
1149 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));