1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #include "TextCollection.h"
23 // Conflicts with std::min and std::max
26 #include "dcover/bwt.hpp"
28 #include "HeapProfiler.h" // FIXME remove
37 #include <cstring> // For strlen()
46 // Save file version info
47 const uchar CSA::versionFlag = 2;
49 ////////////////////////////////////////////////////////////////////////////
50 // Class CSA::THuffAlphabetRank
52 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
58 this->codetable = codetable;
60 bool *B = new bool[n];
63 for (i=0; i< n; i++) {
64 printf("%c:", (char)((int)s[i]-128));
65 for (r=0;r<codetable[(int)s[i]].bits;r++)
66 if (codetable[(int)s[i]].code & (1u <<r))
72 if (level > 100) return;*/
74 if (codetable[(int)s[i]].code & (1u << level)) {
79 if (sum==0 || sum==n) {
84 uchar *sfirst, *ssecond;
86 sfirst = new uchar[n-sum];
88 ssecond = new uchar[sum];
91 if (B[i]) ssecond[k++] = s[i];
92 else sfirst[j++] = s[i];
93 ulong *Binbits = new ulong[n/W+1];
95 Tools::SetField(Binbits,1,i,B[i]);
97 bitrank = new BitRank(Binbits,n,true);
99 left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
103 right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
109 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
110 // testing that the code works correctly
118 if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
120 printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
126 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
127 if (left!=NULL) delete left;
128 if (right!=NULL) delete right;
134 ////////////////////////////////////////////////////////////////////////////
138 * Constructor inits an empty dynamic FM-index.
139 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
141 CSA::CSA(unsigned samplerate)
142 : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
143 suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
146 this->samplerate = samplerate;
149 // FIXME TODO : DynFMI needs distribution of characters before hand
150 // This will create fully balanced wavelet tree for all chars [0, 255].
152 for (unsigned i = 0; i < 255; ++i)
155 dynFMI = new DynFMI(temp, 1, 255, false);
157 /* Debug code: take char distribution from data.txt.
158 uchar *temp = Tools::GetFileContents("data.txt", 0);
159 dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
167 * Given text must include end-marker.
168 * Text identifiers are numbered starting from 0.
170 void CSA::InsertText(uchar const * text)
177 TextPosition m = std::strlen((char *)text) + 1;
178 if (m > maxTextLength)
179 maxTextLength = m; // Store length of the longest text seen so far.
184 this->numberOfTexts ++;
185 this->numberOfAllTexts ++;
187 dynFMI->addText(text, m);
190 texts.append((char *) text);
191 texts.append(1, '\0');
195 emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
196 this->numberOfAllTexts ++;
200 void CSA::MakeStatic()
205 throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
208 if (texts.size() == 0) // Empty collection
211 texts.append(1, '\0');
214 // Bitvector of empty texts, FIXME Remove?
216 //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
217 ulong *tempB = new ulong[numberOfAllTexts/W + 1];
218 for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
220 for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
221 Tools::SetField(tempB, 1, (*it), 1);
223 emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
226 texts.reserve(0); // Release extra capacity
228 /* FILE *fp = fopen("texts.txt", "wb");
229 std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl;
232 uchar *bwt = new uchar[n];
234 // More succinct solution via StringIterator, see below.
235 /* unsigned* maptexts = new unsigned[n+1];
236 // Map text chars to [0..255+numberOfTexts]
238 for (ulong i = 0; i < n; ++i)
240 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
242 uchar c = (uchar)texts[i];
243 maptexts[i] = (unsigned)c + numberOfTexts;
246 assert(count == numberOfTexts);
248 std::cout << "maptext: ";
249 for (ulong i = 0; i <= n; ++i)
250 std::cout << maptexts[i] << ", ";
251 std::cout << std::endl;*/
253 // Mark endmarker positions into bitvector
254 ulong * endmarkers = new ulong[n/W+1];
255 for (ulong i = 0; i < n/W+1; ++i)
257 for (ulong i = 0; i < n; ++i)
259 Tools::SetField(endmarkers, 1, i, 1);
260 BitRank* br = new BitRank(endmarkers, n, true);
261 assert(numberOfTexts == br->rank(n-1));
263 // Build iterators, FIXME clean up iterator construction.
264 StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br);
265 StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br);
267 bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n],
268 &bwt[0], numberOfTexts);
270 bwt[--bwtEndPos] = '\0';
272 texts.reserve(0); // Release capacity
273 delete br; // bitvector of endmarkers
274 } // End of bw transform
275 // std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl;
277 /* fp = fopen("bwt.txt", "wb");
278 std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl;
284 uchar *bwtTest = dynFMI->getBWT();
285 printf("123456789012345678901234567890123456789\n");
286 for (TextPosition i = 0; i < n && i < 100; i ++)
288 printf("%d", (int)bwt[i]);
290 printf("%c", bwt[i]);
292 for (TextPosition i = 0; i < n && i < 100; i ++)
294 printf("%d", (int)bwtTest[i]);
296 printf("%c", bwtTest[i]);
300 assert(numberOfTexts == dynFMI->getCollectionSize());
304 for (ulong i = 0; i < n; ++i)
305 if (bwt[i] != bwtTest[i])
307 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
312 #endif // CSA_TEST_BWT
314 makewavelet(bwt); // Deletes bwt!
317 // Make sampling tables
321 bool CSA::EmptyText(DocId k) const
323 assert(k < (DocId)numberOfTexts);
324 if (emptyTextRank->IsBitSet(k))
329 uchar* CSA::GetText(DocId k) const
331 assert(k < (DocId)numberOfTexts);
334 if (emptyTextRank->IsBitSet(k, &textRank))
336 uchar* result = new uchar[1];
340 // Map to non-empty text
341 k -= textRank; //emptyTextRank->rank(k);
346 // Reserve average string length to avoid reallocs
347 result.reserve(n/numberOfTexts);
349 uchar c = alphabetrank->access(i);
353 i = C[c]+alphabetrank->rank(c,i)-1;
355 c = alphabetrank->access(i); // "next" char.
358 // Convert to uchar (FIXME return string?)
360 uchar* res = new uchar[i+1];
362 for (ulong j = 0; j < i; ++j)
363 res[i-j-1] = result[j];
368 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
370 assert(k < (DocId)numberOfTexts);
371 assert(j < (*textLength)[k]);
375 if (emptyTextRank->IsBitSet(k, &textRank))
377 uchar* result = new uchar[1];
382 // Map to non-empty text
383 k -= textRank; // emptyTextRank->rank(k);
385 // Start position of k'th text
386 ulong start = (*textStartPos)[k];
388 return Substring(i + start, j-i+1);
391 /******************************************************************
392 * Existential queries
394 bool CSA::IsPrefix(uchar const * pattern) const
396 TextPosition m = strlen((char *)pattern);
400 TextPosition sp = 0, ep = 0;
401 Search(pattern, m, &sp, &ep);
403 // Check for end-marker(s) in result interval
404 if (CountEndmarkers(sp, ep))
409 bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
411 TextPosition m = strlen((char *)pattern);
415 TextPosition sp = 0, ep = 0;
416 Search(pattern, m, &sp, &ep);
418 // Check for end-marker(s) in result interval
419 if (CountEndmarkers(sp, ep, begin, end))
425 bool CSA::IsSuffix(uchar const *pattern) const
427 // Here counting is as fast as IsSuffix():
428 if (CountSuffix(pattern) > 0)
433 bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
435 // Here counting is as fast as IsSuffix():
436 if (CountSuffix(pattern, begin, end) > 0)
441 bool CSA::IsEqual(uchar const *pattern) const
443 TextPosition m = std::strlen((char *)pattern);
446 if (numberOfAllTexts - numberOfTexts > 0u)
447 return true; // Empty texts exists
448 return false; // No empty texts exists
451 TextPosition sp = 0, ep = 0;
452 // Match including end-marker
453 Search(pattern, m+1, &sp, &ep);
455 // Check for end-marker(s) in result interval
456 if (CountEndmarkers(sp, ep))
461 bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const
463 TextPosition m = std::strlen((char *)pattern);
466 if (numberOfAllTexts - numberOfTexts > 0u)
467 return true; // Empty texts exists
468 return false; // No empty texts exists
471 TextPosition sp = 0, ep = 0;
472 // Match including end-marker
473 Search(pattern, m+1, &sp, &ep, begin, end);
475 // Check for end-marker(s) in result interval
476 if (CountEndmarkers(sp, ep))
481 bool CSA::IsContains(uchar const * pattern) const
483 TextPosition m = strlen((char *)pattern);
487 TextPosition sp = 0, ep = 0;
488 // Just check if pattern exists somewhere
489 ulong count = Search(pattern, m, &sp, &ep);
496 bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const
498 // Here counting is as fast as existential querying
499 if (CountContains(pattern, begin, end) > 0)
500 return true; // FIXME No need to filter result set
504 bool CSA::IsLessThan(uchar const * pattern) const
506 if (CountLessThan(pattern) > 0)
511 bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
513 if (CountLessThan(pattern, begin, end) > 0)
518 /******************************************************************
521 ulong CSA::Count(uchar const * pattern) const
523 TextPosition m = strlen((char *)pattern);
527 TextPosition sp = 0, ep = 0;
528 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
532 unsigned CSA::CountPrefix(uchar const * pattern) const
534 TextPosition m = strlen((char *)pattern);
536 return numberOfAllTexts;
538 TextPosition sp = 0, ep = 0;
539 Search(pattern, m, &sp, &ep);
541 // Count end-markers in result interval
542 return CountEndmarkers(sp, ep);
545 unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
547 TextPosition m = strlen((char *)pattern);
549 return numberOfAllTexts;
551 TextPosition sp = 0, ep = 0;
552 Search(pattern, m, &sp, &ep);
554 // Count end-markers in result interval
555 return CountEndmarkers(sp, ep, begin, end);
558 unsigned CSA::CountSuffix(uchar const * pattern) const
560 TextPosition m = strlen((char *)pattern);
562 return numberOfAllTexts;
564 TextPosition sp = 0, ep = 0;
565 // Search with end-marker
566 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
571 unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
573 TextPosition m = strlen((char *)pattern);
575 return numberOfAllTexts;
577 TextPosition sp = 0, ep = 0;
578 // Search with end-marker
579 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
584 unsigned CSA::CountEqual(uchar const *pattern) const
586 TextPosition m = strlen((char const *)pattern);
588 return numberOfAllTexts - numberOfTexts; // Empty texts.
590 TextPosition sp = 0, ep = 0;
591 // Match including end-marker
592 Search(pattern, m+1, &sp, &ep);
594 // Count end-markers in result interval
595 return CountEndmarkers(sp, ep);
598 unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const
600 TextPosition m = strlen((char const *)pattern);
602 return numberOfAllTexts - numberOfTexts; // Empty texts.
604 TextPosition sp = 0, ep = 0;
605 // Match including end-marker
606 Search(pattern, m+1, &sp, &ep, begin, end);
608 // Count end-markers in result interval
609 return CountEndmarkers(sp, ep); // Already within [begin, end]
612 unsigned CSA::CountContains(uchar const * pattern) const
614 TextPosition m = strlen((char const *)pattern);
616 return numberOfAllTexts; // Total number of texts.
618 // Here counting is as slow as fetching the result set
619 // because we have to filter out occ's that fall within same document.
620 TextCollection::document_result result = Contains(pattern);
621 return result.size();
624 unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const
626 TextPosition m = strlen((char const *)pattern);
628 return numberOfAllTexts; // Total number of texts.
630 // Here counting is as slow as fetching the result set
631 // because we have to filter out occ's that fall within same document.
632 TextCollection::document_result result = Contains(pattern, begin, end);
633 return result.size();
636 // Less than or equal
637 unsigned CSA::CountLessThan(uchar const * pattern) const
639 TextPosition m = strlen((char const *)pattern);
641 return numberOfAllTexts - numberOfTexts; // Empty texts.
643 TextPosition sp = 0, ep = 0;
644 SearchLessThan(pattern, m, &sp, &ep);
646 // Count end-markers in result interval
647 return CountEndmarkers(sp, ep);
650 unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
652 TextPosition m = strlen((char const *)pattern);
654 return numberOfAllTexts - numberOfTexts; // Empty texts.
656 TextPosition sp = 0, ep = 0;
657 SearchLessThan(pattern, m, &sp, &ep);
659 // Count end-markers in result interval
660 return CountEndmarkers(sp, ep, begin, end);
664 * Document reporting queries
666 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
668 TextPosition m = strlen((char *)pattern);
670 return TextCollection::document_result(); // FIXME Should return all 1...k
672 TextPosition sp = 0, ep = 0;
673 Search(pattern, m, &sp, &ep);
675 TextCollection::document_result result;
677 // Report end-markers in result interval
678 unsigned resultSize = CountEndmarkers(sp, ep);
682 result.reserve(resultSize); // Try to avoid reallocation.
684 // Iterate through end-markers in [sp,ep]:
687 i = alphabetrank->rank(0, sp - 1);
690 // End-marker we found belongs to the "preceeding" doc in the collection
691 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
693 docId = emptyTextRank->select0(docId+1);
694 result.push_back(docId);
703 TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const
705 TextPosition m = strlen((char *)pattern);
707 return TextCollection::document_result(); // FIXME Should return all 1...k
709 TextPosition sp = 0, ep = 0;
710 Search(pattern, m, &sp, &ep);
712 TextCollection::document_result result;
714 // Report end-markers in result interval
715 unsigned resultSize = CountEndmarkers(sp, ep);
719 result.reserve(resultSize); // Try to avoid reallocation.
721 // Iterate through end-markers in [sp,ep] and [begin, end]:
724 i = alphabetrank->rank(0, sp - 1);
727 // End-marker we found belongs to the "preceeding" doc in the collection
728 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
730 docId = emptyTextRank->select0(docId+1);
731 if (docId >= begin && docId <= end)
732 result.push_back(docId);
741 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
743 TextPosition m = strlen((char *)pattern);
745 return TextCollection::document_result(); // FIXME Should return all 1...k
747 TextPosition sp = 0, ep = 0;
748 // Search with end-marker
749 Search(pattern, m+1, &sp, &ep);
751 TextCollection::document_result result;
752 result.reserve(ep-sp+1); // Try to avoid reallocation.
754 ulong sampled_rank_i = 0;
755 // Check each occurrence
756 for (; sp <= ep; ++sp)
760 uchar c = alphabetrank->access(i);
761 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
763 i = C[c]+alphabetrank->rank(c,i)-1;
764 c = alphabetrank->access(i);
766 // Assert: c == '\0' OR sampled->IsBitSet(i)
770 // Rank among the end-markers in BWT
771 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
773 // End-marker that we found belongs to the "preceeding" doc in collection:
774 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
776 docId = emptyTextRank->select0(docId+1);
777 result.push_back(docId);
779 else // Sampled position
781 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
783 docId = emptyTextRank->select0(docId+1);
784 result.push_back(docId);
791 TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const
793 TextPosition m = strlen((char *)pattern);
795 return TextCollection::document_result(); // FIXME Should return all 1...k
797 TextPosition sp = 0, ep = 0;
798 // Search with end-marker
799 Search(pattern, m+1, &sp, &ep, begin, end);
801 TextCollection::document_result result;
802 result.reserve(ep-sp+1); // Try to avoid reallocation.
804 ulong sampled_rank_i = 0;
805 // Check each occurrence, already within [begin, end]
806 for (; sp <= ep; ++sp)
810 uchar c = alphabetrank->access(i);
811 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
813 i = C[c]+alphabetrank->rank(c,i)-1;
814 c = alphabetrank->access(i);
816 // Assert: c == '\0' OR sampled->IsBitSet(i)
820 // Rank among the end-markers in BWT
821 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
823 // End-marker that we found belongs to the "preceeding" doc in collection:
824 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
826 docId = emptyTextRank->select0(docId+1);
827 result.push_back(docId);
829 else // Sampled position
831 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
833 docId = emptyTextRank->select0(docId+1);
834 result.push_back(docId);
842 TextCollection::document_result CSA::Equal(uchar const *pattern) const
844 TextPosition m = strlen((char const *)pattern);
846 return TextCollection::document_result(); // FIXME Should return all empty texts
848 TextPosition sp = 0, ep = 0;
849 // Match including end-marker
850 Search(pattern, m+1, &sp, &ep);
852 TextCollection::document_result result;
854 // Report end-markers in result interval
855 unsigned resultSize = CountEndmarkers(sp, ep);
859 result.reserve(resultSize); // Try to avoid reallocation.
863 i = alphabetrank->rank(0, sp - 1);
866 // End-marker we found belongs to the "previous" doc in the collection
867 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
869 docId = emptyTextRank->select0(docId+1);
870 result.push_back(docId);
879 TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const
881 TextPosition m = strlen((char const *)pattern);
883 return TextCollection::document_result(); // FIXME Should return all empty texts
885 TextPosition sp = 0, ep = 0;
886 // Match including end-marker
887 Search(pattern, m+1, &sp, &ep, begin, end);
889 TextCollection::document_result result;
891 // Report end-markers in result interval
892 unsigned resultSize = CountEndmarkers(sp, ep);
896 result.reserve(resultSize); // Try to avoid reallocation.
900 i = alphabetrank->rank(0, sp - 1);
903 // End-marker we found belongs to the "previous" doc in the collection
904 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
906 docId = emptyTextRank->select0(docId+1);
907 result.push_back(docId); // already within [begin, end]
917 TextCollection::document_result CSA::Contains(uchar const * pattern) const
919 TextPosition m = strlen((char *)pattern);
921 return TextCollection::document_result();
923 TextPosition sp = 0, ep = 0;
924 // Search all occurrences
925 Search(pattern, m, &sp, &ep);
927 // We want unique document indentifiers, using std::set to collect them
928 std::set<DocId> resultSet;
930 ulong sampled_rank_i = 0;
931 // Check each occurrence
932 for (; sp <= ep; ++sp)
935 uchar c = alphabetrank->access(i);
936 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
938 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
939 c = alphabetrank->access(i);
943 // Rank among the end-markers in BWT
944 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
946 // End-marker that we found belongs to the "preceeding" doc in collection:
947 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
948 resultSet.insert(docId);
952 DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
953 assert((unsigned)di < numberOfTexts);
954 resultSet.insert(di);
958 // Convert std::set to std::vector
959 TextCollection::document_result result(resultSet.begin(), resultSet.end());
961 for (document_result::iterator it = result.begin(); it != result.end(); ++it)
962 *it = emptyTextRank->select0(*it+1);
966 TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const
968 TextPosition m = strlen((char *)pattern);
970 return TextCollection::document_result();
972 TextPosition sp = 0, ep = 0;
973 // Search all occurrences
974 Search(pattern, m, &sp, &ep);
976 // We want unique document indentifiers, using std::set to collect them
977 std::set<DocId> resultSet;
979 ulong sampled_rank_i = 0;
980 // Check each occurrence
981 for (; sp <= ep; ++sp)
984 uchar c = alphabetrank->access(i);
985 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
987 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
988 c = alphabetrank->access(i);
992 // Rank among the end-markers in BWT
993 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
995 // End-marker that we found belongs to the "preceeding" doc in collection:
996 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
997 if (docId >= begin && docId <= end)
998 resultSet.insert(docId);
1002 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1003 assert((unsigned)docId < numberOfTexts);
1004 if (docId >= begin && docId <= end)
1005 resultSet.insert(docId);
1009 // Convert std::set to std::vector
1010 TextCollection::document_result result(resultSet.begin(), resultSet.end());
1012 for (document_result::iterator it = result.begin(); it != result.end(); ++it)
1013 *it = emptyTextRank->select0(*it+1);
1017 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
1019 TextPosition m = strlen((char *)pattern);
1021 return TextCollection::document_result(); // empty result set
1023 TextPosition sp = 0, ep = 0;
1024 SearchLessThan(pattern, m, &sp, &ep);
1026 TextCollection::document_result result;
1028 // Report end-markers in result interval
1029 unsigned resultSize = CountEndmarkers(sp, ep);
1030 if (resultSize == 0)
1033 result.reserve(resultSize); // Try to avoid reallocation.
1035 // Iterate through end-markers in [sp,ep]:
1038 i = alphabetrank->rank(0, sp - 1);
1041 // End-marker we found belongs to the "preceeding" doc in the collection
1042 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1044 docId = emptyTextRank->select0(docId+1);
1045 result.push_back(docId);
1054 TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const
1056 TextPosition m = strlen((char *)pattern);
1058 return TextCollection::document_result(); // empty result set
1060 TextPosition sp = 0, ep = 0;
1061 SearchLessThan(pattern, m, &sp, &ep);
1063 TextCollection::document_result result;
1065 // Report end-markers in result interval
1066 unsigned resultSize = CountEndmarkers(sp, ep);
1067 if (resultSize == 0)
1070 result.reserve(resultSize); // Try to avoid reallocation.
1072 // Iterate through end-markers in [sp,ep] and [begin, end]:
1075 i = alphabetrank->rank(0, sp - 1);
1078 // End-marker we found belongs to the "preceeding" doc in the collection
1079 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1081 docId = emptyTextRank->select0(docId+1);
1082 if (docId >= begin && docId <= end)
1083 result.push_back(docId);
1093 * Full result set queries
1095 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
1097 TextPosition m = strlen((char *)pattern);
1099 return full_result(); // FIXME Throw exception?
1101 TextPosition sp = 0, ep = 0;
1102 // Search all occurrences
1103 Search(pattern, m, &sp, &ep);
1106 result.reserve(ep-sp+1); // Try to avoid reallocation.
1108 ulong sampled_rank_i = 0;
1109 // Report each occurrence
1110 for (; sp <= ep; ++sp)
1112 TextPosition i = sp;
1113 TextPosition dist = 0;
1114 uchar c = alphabetrank->access(i);
1115 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
1117 i = C[c]+alphabetrank->rank(c,i)-1;
1118 c = alphabetrank->access(i);
1123 // Rank among the end-markers in BWT
1124 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
1126 // End-marker that we found belongs to the "preceeding" doc in collection:
1127 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
1129 docId = emptyTextRank->select0(docId+1);
1130 result.push_back(make_pair(docId, dist));
1134 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
1135 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1136 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
1139 docId = emptyTextRank->select0(docId+1);
1140 result.push_back(make_pair(docId, textPos));
1147 TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const
1149 TextPosition m = strlen((char *)pattern);
1151 return full_result(); // FIXME Throw exception?
1153 TextPosition sp = 0, ep = 0;
1154 // Search all occurrences
1155 Search(pattern, m, &sp, &ep);
1158 result.reserve(ep-sp+1); // Try to avoid reallocation.
1160 ulong sampled_rank_i = 0;
1161 // Report each occurrence
1162 for (; sp <= ep; ++sp)
1164 TextPosition i = sp;
1165 TextPosition dist = 0;
1166 uchar c = alphabetrank->access(i);
1167 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
1169 i = C[c]+alphabetrank->rank(c,i)-1;
1170 c = alphabetrank->access(i);
1175 // Rank among the end-markers in BWT
1176 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
1178 // End-marker that we found belongs to the "preceeding" doc in collection:
1179 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
1181 docId = emptyTextRank->select0(docId+1);
1182 if (docId >= begin && docId <= end)
1183 result.push_back(make_pair(docId, dist));
1187 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
1188 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1189 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
1192 docId = emptyTextRank->select0(docId+1);
1193 if (docId >= begin && docId <= end)
1194 result.push_back(make_pair(docId, textPos));
1203 * Save index to a file handle
1205 * Throws a std::runtime_error exception on i/o error.
1206 * First byte that is saved represents the version number of the save file.
1207 * In version 2 files, the data fields are:
1208 * uchar versionFlag;
1210 unsigned samplerate;
1212 TextPosition bwtEndPos;
1213 static_sequence * alphabetrank;
1215 BlockArray *suffixes;
1216 BlockArray *suffixDocId;
1217 unsigned numberOfTexts;
1218 unsigned numberOfAllTexts;
1219 ulong maxTextLength;
1220 BlockArray *endmarkerDocId;
1221 BSGAP *emptyTextRank;
1223 void CSA::Save(FILE *file) const
1225 // Saving version info:
1226 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
1227 throw std::runtime_error("CSA::Save(): file write error (version flag).");
1229 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
1230 throw std::runtime_error("CSA::Save(): file write error (n).");
1231 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
1232 throw std::runtime_error("CSA::Save(): file write error (samplerate).");
1234 for(ulong i = 0; i < 256; ++i)
1235 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
1236 throw std::runtime_error("CSA::Save(): file write error (C table).");
1238 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1239 throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
1241 alphabetrank->save(file);
1242 sampled->Save(file);
1243 suffixes->Save(file);
1244 suffixDocId->Save(file);
1246 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1247 throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
1248 if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1249 throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
1250 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1251 throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
1253 endmarkerDocId->Save(file);
1254 emptyTextRank->Save(file);
1260 * Load index from a file handle
1262 * Throws a std::runtime_error exception on i/o error.
1263 * For more info, see CSA::Save().
1265 void CSA::Load(FILE *file, unsigned samplerate)
1267 // Delete everything
1269 delete dynFMI; dynFMI = 0;
1271 delete alphabetrank; alphabetrank = 0;
1272 delete sampled; sampled = 0;
1273 delete suffixes; suffixes = 0;
1274 delete suffixDocId; suffixDocId = 0;
1275 delete [] codetable; codetable = 0;
1277 delete endmarkerDocId; endmarkerDocId = 0;
1278 delete emptyTextRank; emptyTextRank = 0;
1280 this->maxTextLength = 0;
1281 this->numberOfTexts = 0;
1282 this->numberOfAllTexts = 0;
1283 this->samplerate = samplerate;
1287 if (std::fread(&verFlag, 1, 1, file) != 1)
1288 throw std::runtime_error("CSA::Load(): file read error (version flag).");
1289 if (verFlag != CSA::versionFlag)
1290 throw std::runtime_error("CSA::Load(): invalid save file version.");
1292 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
1293 throw std::runtime_error("CSA::Load(): file read error (n).");
1294 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
1295 throw std::runtime_error("CSA::Load(): file read error (samplerate).");
1296 // FIXME samplerate can not be changed during load.
1297 // if (this->samplerate == 0)
1298 // this->samplerate = samplerate;
1300 for(ulong i = 0; i < 256; ++i)
1301 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
1302 throw std::runtime_error("CSA::Load(): file read error (C table).");
1304 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1305 throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
1307 alphabetrank = static_sequence::load(file);
1308 sampled = new BSGAP(file);
1309 suffixes = new BlockArray(file);
1310 suffixDocId = new BlockArray(file);
1312 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1313 throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
1314 if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1315 throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
1316 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1317 throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
1319 endmarkerDocId = new BlockArray(file);
1320 emptyTextRank = new BSGAP(file);
1322 // FIXME Construct data structures with new samplerate
1329 * Rest of the functions follow...
1333 // FIXME Use 2D-search structure
1334 unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
1341 ranksp = alphabetrank->rank(0, sp - 1);
1343 unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
1344 if (resultSize == 0)
1347 // Count end-markers in result interval and within [begin, end]
1349 unsigned i = ranksp;
1352 // End-marker we found belongs to the "previous" doc in the collection
1353 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1355 docId = emptyTextRank->select0(docId+1);
1356 if (docId >= begin && docId <= end)
1365 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1367 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1368 int c = (int)pattern[m-1];
1370 TextPosition sp = C[c];
1371 TextPosition ep = C[c+1]-1;
1372 while (sp<=ep && i>=1)
1374 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1375 c = (int)pattern[--i];
1376 sp = C[c]+alphabetrank->rank(c,sp-1);
1377 ep = C[c]+alphabetrank->rank(c,ep)-1;
1387 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
1389 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1390 int c = (int)pattern[m-1];
1391 assert(c == 0); // Start from endmarkers
1393 TextPosition sp = begin;
1394 TextPosition ep = end;
1395 while (sp<=ep && i>=1)
1397 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1398 c = (int)pattern[--i];
1399 sp = C[c]+alphabetrank->rank(c,sp-1);
1400 ep = C[c]+alphabetrank->rank(c,ep)-1;
1411 ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1413 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1414 uint c = (int)pattern[m-1];
1416 TextPosition sp = 1;
1417 TextPosition ep = C[c+1]-1;
1418 while (sp<=ep && i>=1)
1420 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1421 c = (int)pattern[--i];
1422 uint result = alphabetrank->rank(c,ep);
1441 delete alphabetrank;
1445 delete [] codetable; // FIXME remove
1447 delete endmarkerDocId;
1448 delete emptyTextRank;
1451 void CSA::makewavelet(uchar *bwt)
1460 if (C[i]>0) {min = i; break;}
1461 for (i=255;i>=min;--i)
1462 if (C[i]>0) {max = i; break;}
1464 ulong prev=C[0], temp;
1466 for (i=1;i<256;i++) {
1471 // this->codetable = node::makecodetable(bwt,n);
1472 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1474 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1475 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1476 // std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1478 HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
1480 alphabet_mapper * am = new alphabet_mapper_none();
1481 static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1482 // static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1483 wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1484 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1486 bwt = 0; // already deleted
1488 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1489 // std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1492 void CSA::maketables()
1494 // Calculate BWT end-marker position (of last inserted text)
1495 // Note: bwtEndPos already set!
1496 // and the length of the first text (counter l):
1501 uint alphabetrank_i_tmp = 0;
1502 uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
1505 i = C[c]+alphabetrank_i_tmp-1;
1507 c = alphabetrank->access(i, alphabetrank_i_tmp);
1510 if (i != bwtEndPos) // compare result
1512 std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl;
1518 // Build up arrays for text length and starting positions
1519 // FIXME Temp, remove
1520 //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
1521 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1522 //(*textLength)[0] = l;
1523 (*textStartPos)[0] = 0;
1525 // Construct samples
1526 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1527 unsigned ceilLog2n = Tools::CeilLog2(n);
1528 BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
1529 BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1531 // Mapping from end-markers to doc ID's:
1532 endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1534 ulong *sampledpositions = new ulong[n/W+1];
1535 for (ulong i=0;i<n/W+1;i++)
1536 sampledpositions[i]=0lu;
1538 ulong x,p=bwtEndPos;
1539 ulong sampleCount = 0;
1540 // Keeping track of text position of prev. end-marker seen
1541 ulong posOfSuccEndmarker = n;
1542 DocId textId = numberOfTexts;
1545 uint alphabetrank_i_tmp =0;
1548 for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1549 // i substitutes SA->GetPos(i)
1552 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1553 Tools::SetField(sampledpositions,1,p,1);
1554 (*positions)[sampleCount] = p;
1555 (*tmpSuffix)[sampleCount] = x; // FIXME remove
1559 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1564 // Record the order of end-markers in BWT:
1565 ulong endmarkerRank = alphabetrank_i_tmp - 1;
1566 (*endmarkerDocId)[endmarkerRank] = textId;
1568 // Store text length and text start position:
1569 if (textId < (DocId)numberOfTexts - 1)
1571 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1572 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1573 posOfSuccEndmarker = x;
1576 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1577 p = textId; // Correct LF-mapping to the last char of the previous text.
1580 p = C[c]+alphabetrank_i_tmp-1;
1582 assert(textId == 0);
1584 sampled = new BSGAP(sampledpositions,n,true);
1585 sampleLength = sampled->rank(n-1);
1586 assert(sampleCount == sampleLength);
1588 // Suffixes store an offset from the text start position
1589 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1590 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1592 for(ulong i=0; i<sampleLength; i++) {
1593 assert((*positions)[i] < n);
1594 ulong j = sampled->rank((*positions)[i]);
1595 if (j==0) j=sampleLength;
1596 TextPosition textPos = (*tmpSuffix)[i];
1597 (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1599 assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1600 assert((*suffixDocId)[j-1] < numberOfTexts);
1601 // calculate offset from text start:
1602 (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1604 // FIXME Temp, remove
1607 // delete textLength;
1608 delete textStartPos;
1613 * Finds document identifier for given text position
1615 * Starting text position of the document is stored into second parameter.
1616 * Binary searching on text starting positions.
1618 TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1623 DocId b = numberOfTexts - 1;
1626 DocId c = a + (b - a)/2;
1627 if ((*textStartPos)[c] > i)
1629 else if ((*textStartPos)[c+1] > i)
1635 assert(a < (DocId)numberOfTexts);
1636 assert(i >= (*textStartPos)[a]);
1637 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1641 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1643 TCodeEntry *result = new TCodeEntry[ 256 ];
1645 count_chars( text, n, result );
1646 std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1648 // First I push all the leaf nodes into the queue
1650 for ( unsigned int i = 0 ; i < 256 ; i++ )
1651 if ( result[ i ].count )
1652 q.push(node( i, result[ i ].count ) );
1654 // This loop removes the two smallest nodes from the
1655 // queue. It creates a new internal node that has
1656 // those two nodes as children. The new internal node
1657 // is then inserted into the priority queue. When there
1658 // is only one node in the priority queue, the tree
1662 while ( q.size() > 1 ) {
1663 node *child0 = new node( q.top() );
1665 node *child1 = new node( q.top() );
1667 q.push( node( child0, child1 ) );
1670 // Now I compute and return the codetable
1672 q.top().maketable(0u,0u, result);
1678 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1682 child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1683 child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1689 codetable[value].code = code;
1690 codetable[value].bits = bits;
1694 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1697 for (i = 0 ; i < 256 ; i++ )
1698 counts[ i ].count = 0;
1700 counts[(int)text[i]].count++;
1703 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1704 return x | (bit << pos);