1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
20 #include "TCImplementation.h"
22 //#include "HeapProfiler.h" // FIXME remove
31 #include <cstring> // For strlen()
40 // Save file version info
41 const uchar TCImplementation::versionFlag = 3;
44 * Constructor inits an empty dynamic FM-index.
45 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
47 TCImplementation::TCImplementation(uchar * bwt, ulong length, unsigned samplerate_, unsigned numberOfTexts_, ulong maxTextLength_)
48 : n(length), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
49 suffixDocId(0), numberOfTexts(numberOfTexts_), numberOfAllTexts(numberOfTexts_), maxTextLength(maxTextLength_), endmarkerDocId(0),
53 // Bitvector of empty texts, FIXME Remove?
55 //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
56 ulong *tempB = new ulong[numberOfTexts/W + 1];
57 for (ulong i = 0; i < numberOfTexts/W + 1; ++i)
59 // for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
60 // Tools::SetField(tempB, 1, (*it), 1);
61 emptyTextRank = new BSGAP(tempB, numberOfTexts, true);
63 makewavelet(bwt); // Deletes bwt!
66 // Make sampling tables
70 bool TCImplementation::EmptyText(DocId k) const
72 assert(k < (DocId)numberOfTexts);
73 if (emptyTextRank->IsBitSet(k))
78 uchar* TCImplementation::GetText(DocId k) const
80 assert(k < (DocId)numberOfTexts);
83 if (emptyTextRank->IsBitSet(k, &textRank))
85 uchar* result = new uchar[1];
89 // Map to non-empty text
90 k -= textRank; //emptyTextRank->rank(k);
95 // Reserve average string length to avoid reallocs
96 result.reserve(n/numberOfTexts);
98 uchar c = alphabetrank->access(i);
102 i = C[c]+alphabetrank->rank(c,i)-1;
104 c = alphabetrank->access(i); // "next" char.
107 // Convert to uchar (FIXME return string?)
109 uchar* res = new uchar[i+1];
111 for (ulong j = 0; j < i; ++j)
112 res[i-j-1] = result[j];
117 uchar* TCImplementation::GetText(DocId k, TextPosition i, TextPosition j) const
119 assert(k < (DocId)numberOfTexts);
120 assert(j < (*textLength)[k]);
124 if (emptyTextRank->IsBitSet(k, &textRank))
126 uchar* result = new uchar[1];
131 // Map to non-empty text
132 k -= textRank; // emptyTextRank->rank(k);
134 // Start position of k'th text
135 ulong start = (*textStartPos)[k];
137 return Substring(i + start, j-i+1);
140 /******************************************************************
141 * Existential queries
143 bool TCImplementation::IsPrefix(uchar const * pattern) const
145 TextPosition m = strlen((char *)pattern);
149 TextPosition sp = 0, ep = 0;
150 Search(pattern, m, &sp, &ep);
152 // Check for end-marker(s) in result interval
153 if (CountEndmarkers(sp, ep))
158 bool TCImplementation::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
160 TextPosition m = strlen((char *)pattern);
164 TextPosition sp = 0, ep = 0;
165 Search(pattern, m, &sp, &ep);
167 // Check for end-marker(s) in result interval
168 if (CountEndmarkers(sp, ep, begin, end))
174 bool TCImplementation::IsSuffix(uchar const *pattern) const
176 // Here counting is as fast as IsSuffix():
177 if (CountSuffix(pattern) > 0)
182 bool TCImplementation::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
184 // Here counting is as fast as IsSuffix():
185 if (CountSuffix(pattern, begin, end) > 0)
190 bool TCImplementation::IsEqual(uchar const *pattern) const
192 TextPosition m = std::strlen((char *)pattern);
195 if (numberOfAllTexts - numberOfTexts > 0u)
196 return true; // Empty texts exists
197 return false; // No empty texts exists
200 TextPosition sp = 0, ep = 0;
201 // Match including end-marker
202 Search(pattern, m+1, &sp, &ep);
204 // Check for end-marker(s) in result interval
205 if (CountEndmarkers(sp, ep))
210 bool TCImplementation::IsEqual(uchar const *pattern, DocId begin, DocId end) const
212 TextPosition m = std::strlen((char *)pattern);
215 if (numberOfAllTexts - numberOfTexts > 0u)
216 return true; // Empty texts exists
217 return false; // No empty texts exists
220 TextPosition sp = 0, ep = 0;
221 // Match including end-marker
222 Search(pattern, m+1, &sp, &ep, begin, end);
224 // Check for end-marker(s) in result interval
225 if (CountEndmarkers(sp, ep))
230 bool TCImplementation::IsContains(uchar const * pattern) const
232 TextPosition m = strlen((char *)pattern);
236 TextPosition sp = 0, ep = 0;
237 // Just check if pattern exists somewhere
238 ulong count = Search(pattern, m, &sp, &ep);
245 bool TCImplementation::IsContains(uchar const * pattern, DocId begin, DocId end) const
247 // Here counting is as fast as existential querying
248 if (CountContains(pattern, begin, end) > 0)
249 return true; // FIXME No need to filter result set
253 bool TCImplementation::IsLessThan(uchar const * pattern) const
255 if (CountLessThan(pattern) > 0)
260 bool TCImplementation::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
262 if (CountLessThan(pattern, begin, end) > 0)
267 /******************************************************************
270 ulong TCImplementation::Count(uchar const * pattern) const
272 TextPosition m = strlen((char *)pattern);
276 TextPosition sp = 0, ep = 0;
277 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
281 unsigned TCImplementation::CountPrefix(uchar const * pattern) const
283 TextPosition m = strlen((char *)pattern);
285 return numberOfAllTexts;
287 TextPosition sp = 0, ep = 0;
288 Search(pattern, m, &sp, &ep);
290 // Count end-markers in result interval
291 return CountEndmarkers(sp, ep);
294 unsigned TCImplementation::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
296 TextPosition m = strlen((char *)pattern);
298 return numberOfAllTexts;
300 TextPosition sp = 0, ep = 0;
301 Search(pattern, m, &sp, &ep);
303 // Count end-markers in result interval
304 return CountEndmarkers(sp, ep, begin, end);
307 unsigned TCImplementation::CountSuffix(uchar const * pattern) const
309 TextPosition m = strlen((char *)pattern);
311 return numberOfAllTexts;
313 TextPosition sp = 0, ep = 0;
314 // Search with end-marker
315 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
320 unsigned TCImplementation::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
322 TextPosition m = strlen((char *)pattern);
324 return numberOfAllTexts;
326 TextPosition sp = 0, ep = 0;
327 // Search with end-marker
328 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
333 unsigned TCImplementation::CountEqual(uchar const *pattern) const
335 TextPosition m = strlen((char const *)pattern);
337 return numberOfAllTexts - numberOfTexts; // Empty texts.
339 TextPosition sp = 0, ep = 0;
340 // Match including end-marker
341 Search(pattern, m+1, &sp, &ep);
343 // Count end-markers in result interval
344 return CountEndmarkers(sp, ep);
347 unsigned TCImplementation::CountEqual(uchar const *pattern, DocId begin, DocId end) const
349 TextPosition m = strlen((char const *)pattern);
351 return numberOfAllTexts - numberOfTexts; // Empty texts.
353 TextPosition sp = 0, ep = 0;
354 // Match including end-marker
355 Search(pattern, m+1, &sp, &ep, begin, end);
357 // Count end-markers in result interval
358 return CountEndmarkers(sp, ep); // Already within [begin, end]
361 unsigned TCImplementation::CountContains(uchar const * pattern) const
363 TextPosition m = strlen((char const *)pattern);
365 return numberOfAllTexts; // Total number of texts.
367 // Here counting is as slow as fetching the result set
368 // because we have to filter out occ's that fall within same document.
369 TextCollection::document_result result = Contains(pattern);
370 return result.size();
373 unsigned TCImplementation::CountContains(uchar const * pattern, DocId begin, DocId end) const
375 TextPosition m = strlen((char const *)pattern);
377 return numberOfAllTexts; // Total number of texts.
379 // Here counting is as slow as fetching the result set
380 // because we have to filter out occ's that fall within same document.
381 TextCollection::document_result result = Contains(pattern, begin, end);
382 return result.size();
385 // Less than or equal
386 unsigned TCImplementation::CountLessThan(uchar const * pattern) const
388 TextPosition m = strlen((char const *)pattern);
390 return numberOfAllTexts - numberOfTexts; // Empty texts.
392 TextPosition sp = 0, ep = 0;
393 SearchLessThan(pattern, m, &sp, &ep);
395 // Count end-markers in result interval
396 return CountEndmarkers(sp, ep);
399 unsigned TCImplementation::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
401 TextPosition m = strlen((char const *)pattern);
403 return numberOfAllTexts - numberOfTexts; // Empty texts.
405 TextPosition sp = 0, ep = 0;
406 SearchLessThan(pattern, m, &sp, &ep);
408 // Count end-markers in result interval
409 return CountEndmarkers(sp, ep, begin, end);
413 * Document reporting queries
415 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern) const
417 TextPosition m = strlen((char *)pattern);
419 return TextCollection::document_result(); // FIXME Should return all 1...k
421 TextPosition sp = 0, ep = 0;
422 Search(pattern, m, &sp, &ep);
424 TextCollection::document_result result;
426 // Report end-markers in result interval
427 unsigned resultSize = CountEndmarkers(sp, ep);
431 result.reserve(resultSize); // Try to avoid reallocation.
433 // Iterate through end-markers in [sp,ep]:
436 i = alphabetrank->rank(0, sp - 1);
439 // End-marker we found belongs to the "preceeding" doc in the collection
440 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
442 docId = emptyTextRank->select0(docId+1);
443 result.push_back(docId);
452 TextCollection::document_result TCImplementation::Prefix(uchar const * pattern, DocId begin, DocId end) const
454 TextPosition m = strlen((char *)pattern);
456 return TextCollection::document_result(); // FIXME Should return all 1...k
458 TextPosition sp = 0, ep = 0;
459 Search(pattern, m, &sp, &ep);
461 TextCollection::document_result result;
463 // Report end-markers in result interval
464 unsigned resultSize = CountEndmarkers(sp, ep);
468 result.reserve(resultSize); // Try to avoid reallocation.
470 // Iterate through end-markers in [sp,ep] and [begin, end]:
473 i = alphabetrank->rank(0, sp - 1);
476 // End-marker we found belongs to the "preceeding" doc in the collection
477 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
479 docId = emptyTextRank->select0(docId+1);
480 if (docId >= begin && docId <= end)
481 result.push_back(docId);
490 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern) const
492 TextPosition m = strlen((char *)pattern);
494 return TextCollection::document_result(); // FIXME Should return all 1...k
496 TextPosition sp = 0, ep = 0;
497 // Search with end-marker
498 Search(pattern, m+1, &sp, &ep);
500 TextCollection::document_result result;
501 result.reserve(ep-sp+1); // Try to avoid reallocation.
503 ulong sampled_rank_i = 0;
504 // Check each occurrence
505 for (; sp <= ep; ++sp)
509 uchar c = alphabetrank->access(i);
510 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
512 i = C[c]+alphabetrank->rank(c,i)-1;
513 c = alphabetrank->access(i);
515 // Assert: c == '\0' OR sampled->IsBitSet(i)
519 // Rank among the end-markers in BWT
520 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
522 // End-marker that we found belongs to the "preceeding" doc in collection:
523 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
525 docId = emptyTextRank->select0(docId+1);
526 result.push_back(docId);
528 else // Sampled position
530 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
532 docId = emptyTextRank->select0(docId+1);
533 result.push_back(docId);
540 TextCollection::document_result TCImplementation::Suffix(uchar const * pattern, DocId begin, DocId end) const
542 TextPosition m = strlen((char *)pattern);
544 return TextCollection::document_result(); // FIXME Should return all 1...k
546 TextPosition sp = 0, ep = 0;
547 // Search with end-marker
548 Search(pattern, m+1, &sp, &ep, begin, end);
550 TextCollection::document_result result;
551 result.reserve(ep-sp+1); // Try to avoid reallocation.
553 ulong sampled_rank_i = 0;
554 // Check each occurrence, already within [begin, end]
555 for (; sp <= ep; ++sp)
559 uchar c = alphabetrank->access(i);
560 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
562 i = C[c]+alphabetrank->rank(c,i)-1;
563 c = alphabetrank->access(i);
565 // Assert: c == '\0' OR sampled->IsBitSet(i)
569 // Rank among the end-markers in BWT
570 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
572 // End-marker that we found belongs to the "preceeding" doc in collection:
573 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
575 docId = emptyTextRank->select0(docId+1);
576 result.push_back(docId);
578 else // Sampled position
580 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
582 docId = emptyTextRank->select0(docId+1);
583 result.push_back(docId);
591 TextCollection::document_result TCImplementation::Equal(uchar const *pattern) const
593 TextPosition m = strlen((char const *)pattern);
595 return TextCollection::document_result(); // FIXME Should return all empty texts
597 TextPosition sp = 0, ep = 0;
598 // Match including end-marker
599 Search(pattern, m+1, &sp, &ep);
601 TextCollection::document_result result;
603 // Report end-markers in result interval
604 unsigned resultSize = CountEndmarkers(sp, ep);
608 result.reserve(resultSize); // Try to avoid reallocation.
612 i = alphabetrank->rank(0, sp - 1);
615 // End-marker we found belongs to the "previous" doc in the collection
616 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
618 docId = emptyTextRank->select0(docId+1);
619 result.push_back(docId);
628 TextCollection::document_result TCImplementation::Equal(uchar const *pattern, DocId begin, DocId end) const
630 TextPosition m = strlen((char const *)pattern);
632 return TextCollection::document_result(); // FIXME Should return all empty texts
634 TextPosition sp = 0, ep = 0;
635 // Match including end-marker
636 Search(pattern, m+1, &sp, &ep, begin, end);
638 TextCollection::document_result result;
640 // Report end-markers in result interval
641 unsigned resultSize = CountEndmarkers(sp, ep);
645 result.reserve(resultSize); // Try to avoid reallocation.
649 i = alphabetrank->rank(0, sp - 1);
652 // End-marker we found belongs to the "previous" doc in the collection
653 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
655 docId = emptyTextRank->select0(docId+1);
656 result.push_back(docId); // already within [begin, end]
666 TextCollection::document_result TCImplementation::Contains(uchar const * pattern) const
668 TextPosition m = strlen((char *)pattern);
670 return TextCollection::document_result();
672 TextPosition sp = 0, ep = 0;
673 // Search all occurrences
674 Search(pattern, m, &sp, &ep);
676 // We want unique document indentifiers, using std::set to collect them
677 std::set<DocId> resultSet;
679 ulong sampled_rank_i = 0;
680 // Check each occurrence
681 for (; sp <= ep; ++sp)
684 uchar c = alphabetrank->access(i);
685 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
687 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
688 c = alphabetrank->access(i);
692 // Rank among the end-markers in BWT
693 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
695 // End-marker that we found belongs to the "preceeding" doc in collection:
696 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
697 resultSet.insert(docId);
701 DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
702 assert((unsigned)di < numberOfTexts);
703 resultSet.insert(di);
707 // Convert std::set to std::vector
708 TextCollection::document_result result(resultSet.begin(), resultSet.end());
710 for (document_result::iterator it = result.begin(); it != result.end(); ++it)
711 *it = emptyTextRank->select0(*it+1);
715 TextCollection::document_result TCImplementation::Contains(uchar const * pattern, DocId begin, DocId end) const
717 TextPosition m = strlen((char *)pattern);
719 return TextCollection::document_result();
721 TextPosition sp = 0, ep = 0;
722 // Search all occurrences
723 Search(pattern, m, &sp, &ep);
725 // We want unique document indentifiers, using std::set to collect them
726 std::set<DocId> resultSet;
728 ulong sampled_rank_i = 0;
729 // Check each occurrence
730 for (; sp <= ep; ++sp)
733 uchar c = alphabetrank->access(i);
734 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
736 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
737 c = alphabetrank->access(i);
741 // Rank among the end-markers in BWT
742 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
744 // End-marker that we found belongs to the "preceeding" doc in collection:
745 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
746 if (docId >= begin && docId <= end)
747 resultSet.insert(docId);
751 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
752 assert((unsigned)docId < numberOfTexts);
753 if (docId >= begin && docId <= end)
754 resultSet.insert(docId);
758 // Convert std::set to std::vector
759 TextCollection::document_result result(resultSet.begin(), resultSet.end());
761 for (document_result::iterator it = result.begin(); it != result.end(); ++it)
762 *it = emptyTextRank->select0(*it+1);
766 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern) const
768 TextPosition m = strlen((char *)pattern);
770 return TextCollection::document_result(); // empty result set
772 TextPosition sp = 0, ep = 0;
773 SearchLessThan(pattern, m, &sp, &ep);
775 TextCollection::document_result result;
777 // Report end-markers in result interval
778 unsigned resultSize = CountEndmarkers(sp, ep);
782 result.reserve(resultSize); // Try to avoid reallocation.
784 // Iterate through end-markers in [sp,ep]:
787 i = alphabetrank->rank(0, sp - 1);
790 // End-marker we found belongs to the "preceeding" doc in the collection
791 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
793 docId = emptyTextRank->select0(docId+1);
794 result.push_back(docId);
803 TextCollection::document_result TCImplementation::LessThan(uchar const * pattern, DocId begin, DocId end) const
805 TextPosition m = strlen((char *)pattern);
807 return TextCollection::document_result(); // empty result set
809 TextPosition sp = 0, ep = 0;
810 SearchLessThan(pattern, m, &sp, &ep);
812 TextCollection::document_result result;
814 // Report end-markers in result interval
815 unsigned resultSize = CountEndmarkers(sp, ep);
819 result.reserve(resultSize); // Try to avoid reallocation.
821 // Iterate through end-markers in [sp,ep] and [begin, end]:
824 i = alphabetrank->rank(0, sp - 1);
827 // End-marker we found belongs to the "preceeding" doc in the collection
828 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
830 docId = emptyTextRank->select0(docId+1);
831 if (docId >= begin && docId <= end)
832 result.push_back(docId);
842 * Full result set queries
844 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern) const
846 TextPosition m = strlen((char *)pattern);
848 return full_result(); // FIXME Throw exception?
850 TextPosition sp = 0, ep = 0;
851 // Search all occurrences
852 Search(pattern, m, &sp, &ep);
855 result.reserve(ep-sp+1); // Try to avoid reallocation.
857 ulong sampled_rank_i = 0;
858 // Report each occurrence
859 for (; sp <= ep; ++sp)
862 TextPosition dist = 0;
863 uchar c = alphabetrank->access(i);
864 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
866 i = C[c]+alphabetrank->rank(c,i)-1;
867 c = alphabetrank->access(i);
872 // Rank among the end-markers in BWT
873 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
875 // End-marker that we found belongs to the "preceeding" doc in collection:
876 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
878 docId = emptyTextRank->select0(docId+1);
879 result.push_back(make_pair(docId, dist));
883 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
884 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
885 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
888 docId = emptyTextRank->select0(docId+1);
889 result.push_back(make_pair(docId, textPos));
896 TextCollection::full_result TCImplementation::FullContains(uchar const * pattern, DocId begin, DocId end) const
898 TextPosition m = strlen((char *)pattern);
900 return full_result(); // FIXME Throw exception?
902 TextPosition sp = 0, ep = 0;
903 // Search all occurrences
904 Search(pattern, m, &sp, &ep);
907 result.reserve(ep-sp+1); // Try to avoid reallocation.
909 ulong sampled_rank_i = 0;
910 // Report each occurrence
911 for (; sp <= ep; ++sp)
914 TextPosition dist = 0;
915 uchar c = alphabetrank->access(i);
916 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
918 i = C[c]+alphabetrank->rank(c,i)-1;
919 c = alphabetrank->access(i);
924 // Rank among the end-markers in BWT
925 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
927 // End-marker that we found belongs to the "preceeding" doc in collection:
928 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
930 docId = emptyTextRank->select0(docId+1);
931 if (docId >= begin && docId <= end)
932 result.push_back(make_pair(docId, dist));
936 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
937 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
938 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
941 docId = emptyTextRank->select0(docId+1);
942 if (docId >= begin && docId <= end)
943 result.push_back(make_pair(docId, textPos));
952 * Save index to a file handle
954 * Throws a std::runtime_error exception on i/o error.
955 * First byte that is saved represents the version number of the save file.
956 * In version 2 files, the data fields are:
961 TextPosition bwtEndPos;
962 static_sequence * alphabetrank;
964 BlockArray *suffixes;
965 BlockArray *suffixDocId;
966 unsigned numberOfTexts;
967 unsigned numberOfAllTexts;
969 BlockArray *endmarkerDocId;
970 BSGAP *emptyTextRank;
972 void TCImplementation::Save(FILE *file) const
974 // Saving version info:
975 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
976 throw std::runtime_error("TCImplementation::Save(): file write error (version flag).");
978 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
979 throw std::runtime_error("TCImplementation::Save(): file write error (n).");
980 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
981 throw std::runtime_error("TCImplementation::Save(): file write error (samplerate).");
983 for(ulong i = 0; i < 256; ++i)
984 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
985 throw std::runtime_error("TCImplementation::Save(): file write error (C table).");
987 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
988 throw std::runtime_error("TCImplementation::Save(): file write error (bwt end position).");
990 alphabetrank->save(file);
992 suffixes->Save(file);
993 suffixDocId->Save(file);
995 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
996 throw std::runtime_error("TCImplementation::Save(): file write error (numberOfTexts).");
997 if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
998 throw std::runtime_error("TCImplementation::Save(): file write error (numberOfAllTexts).");
999 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1000 throw std::runtime_error("TCImplementation::Save(): file write error (maxTextLength).");
1002 endmarkerDocId->Save(file);
1003 emptyTextRank->Save(file);
1009 * Load index from a file handle
1011 * Throws a std::runtime_error exception on i/o error.
1012 * For more info, see TCImplementation::Save().
1014 TCImplementation::TCImplementation(FILE *file, unsigned samplerate_)
1015 : n(0), samplerate(samplerate_), alphabetrank(0), sampled(0), suffixes(0),
1016 suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
1020 if (std::fread(&verFlag, 1, 1, file) != 1)
1021 throw std::runtime_error("TCImplementation::Load(): file read error (version flag).");
1022 if (verFlag != TCImplementation::versionFlag)
1023 throw std::runtime_error("TCImplementation::Load(): invalid save file version.");
1025 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
1026 throw std::runtime_error("TCImplementation::Load(): file read error (n).");
1027 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
1028 throw std::runtime_error("TCImplementation::Load(): file read error (samplerate).");
1029 // FIXME samplerate can not be changed during load.
1030 // if (this->samplerate == 0)
1031 // this->samplerate = samplerate;
1033 for(ulong i = 0; i < 256; ++i)
1034 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
1035 throw std::runtime_error("TCImplementation::Load(): file read error (C table).");
1037 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1038 throw std::runtime_error("TCImplementation::Load(): file read error (bwt end position).");
1040 alphabetrank = static_sequence::load(file);
1041 sampled = new BSGAP(file);
1042 suffixes = new BlockArray(file);
1043 suffixDocId = new BlockArray(file);
1045 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1046 throw std::runtime_error("TCImplementation::Load(): file read error (numberOfTexts).");
1047 if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1048 throw std::runtime_error("TCImplementation::Load(): file read error (numberOfAllTexts).");
1049 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1050 throw std::runtime_error("TCImplementation::Load(): file read error (maxTextLength).");
1052 endmarkerDocId = new BlockArray(file);
1053 emptyTextRank = new BSGAP(file);
1055 // FIXME Construct data structures with new samplerate
1062 * Rest of the functions follow...
1066 // FIXME Use 2D-search structure
1067 unsigned TCImplementation::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
1074 ranksp = alphabetrank->rank(0, sp - 1);
1076 unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
1077 if (resultSize == 0)
1080 // Count end-markers in result interval and within [begin, end]
1082 unsigned i = ranksp;
1085 // End-marker we found belongs to the "previous" doc in the collection
1086 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1088 docId = emptyTextRank->select0(docId+1);
1089 if (docId >= begin && docId <= end)
1098 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1100 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1101 int c = (int)pattern[m-1];
1103 TextPosition sp = C[c];
1104 TextPosition ep = C[c+1]-1;
1105 while (sp<=ep && i>=1)
1107 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1108 c = (int)pattern[--i];
1109 sp = C[c]+alphabetrank->rank(c,sp-1);
1110 ep = C[c]+alphabetrank->rank(c,ep)-1;
1120 ulong TCImplementation::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
1122 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1123 int c = (int)pattern[m-1];
1124 assert(c == 0); // Start from endmarkers
1126 TextPosition sp = begin;
1127 TextPosition ep = end;
1128 while (sp<=ep && i>=1)
1130 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1131 c = (int)pattern[--i];
1132 sp = C[c]+alphabetrank->rank(c,sp-1);
1133 ep = C[c]+alphabetrank->rank(c,ep)-1;
1144 ulong TCImplementation::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1146 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1147 uint c = (int)pattern[m-1];
1149 TextPosition sp = 1;
1150 TextPosition ep = C[c+1]-1;
1151 while (sp<=ep && i>=1)
1153 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1154 c = (int)pattern[--i];
1155 uint result = alphabetrank->rank(c,ep);
1170 TCImplementation::~TCImplementation() {
1174 delete alphabetrank;
1179 delete endmarkerDocId;
1180 delete emptyTextRank;
1183 void TCImplementation::makewavelet(uchar *bwt)
1192 if (C[i]>0) {min = i; break;}
1193 for (i=255;i>=min;--i)
1194 if (C[i]>0) {max = i; break;}
1196 ulong prev=C[0], temp;
1198 for (i=1;i<256;i++) {
1203 // this->codetable = node::makecodetable(bwt,n);
1204 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1206 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1207 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1208 // std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1210 // HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
1212 alphabet_mapper * am = new alphabet_mapper_none();
1213 static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1214 // static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1215 wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1216 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1218 bwt = 0; // already deleted
1220 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1221 // std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1224 void TCImplementation::maketables()
1226 // Calculate BWT end-marker position (of last inserted text)
1229 uint alphabetrank_i_tmp = 0;
1230 uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
1233 i = C[c]+alphabetrank_i_tmp-1;
1234 c = alphabetrank->access(i, alphabetrank_i_tmp);
1237 this->bwtEndPos = i;
1240 // Build up arrays for text length and starting positions
1241 // FIXME Temp, remove
1242 //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
1243 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1244 //(*textLength)[0] = l;
1245 (*textStartPos)[0] = 0;
1247 // Construct samples
1248 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1249 unsigned ceilLog2n = Tools::CeilLog2(n);
1250 BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
1251 BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1253 // Mapping from end-markers to doc ID's:
1254 endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1256 ulong *sampledpositions = new ulong[n/W+1];
1257 for (ulong i=0;i<n/W+1;i++)
1258 sampledpositions[i]=0lu;
1260 ulong x,p=bwtEndPos;
1261 ulong sampleCount = 0;
1262 // Keeping track of text position of prev. end-marker seen
1263 ulong posOfSuccEndmarker = n;
1264 DocId textId = numberOfTexts;
1267 uint alphabetrank_i_tmp =0;
1270 for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1271 // i substitutes SA->GetPos(i)
1274 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1275 Tools::SetField(sampledpositions,1,p,1);
1276 (*positions)[sampleCount] = p;
1277 (*tmpSuffix)[sampleCount] = x; // FIXME remove
1281 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1286 // Record the order of end-markers in BWT:
1287 ulong endmarkerRank = alphabetrank_i_tmp - 1;
1288 (*endmarkerDocId)[endmarkerRank] = textId;
1290 // Store text length and text start position:
1291 if (textId < (DocId)numberOfTexts - 1)
1293 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1294 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1295 posOfSuccEndmarker = x;
1298 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1299 p = textId; // Correct LF-mapping to the last char of the previous text.
1302 p = C[c]+alphabetrank_i_tmp-1;
1304 assert(textId == 0);
1306 sampled = new BSGAP(sampledpositions,n,true);
1307 sampleLength = sampled->rank(n-1);
1308 assert(sampleCount == sampleLength);
1310 // Suffixes store an offset from the text start position
1311 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1312 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1314 for(ulong i=0; i<sampleLength; i++) {
1315 assert((*positions)[i] < n);
1316 ulong j = sampled->rank((*positions)[i]);
1317 if (j==0) j=sampleLength;
1318 TextPosition textPos = (*tmpSuffix)[i];
1319 (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1321 assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1322 assert((*suffixDocId)[j-1] < numberOfTexts);
1323 // calculate offset from text start:
1324 (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1326 // FIXME Temp, remove
1329 // delete textLength;
1330 delete textStartPos;
1335 * Finds document identifier for given text position
1337 * Starting text position of the document is stored into second parameter.
1338 * Binary searching on text starting positions.
1340 TextCollection::DocId TCImplementation::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1345 DocId b = numberOfTexts - 1;
1348 DocId c = a + (b - a)/2;
1349 if ((*textStartPos)[c] > i)
1351 else if ((*textStartPos)[c+1] > i)
1357 assert(a < (DocId)numberOfTexts);
1358 assert(i >= (*textStartPos)[a]);
1359 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));