1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #include "TextCollection.h"
23 // Conflicts with std::min and std::max
26 #include "dcover/bwt.hpp"
28 //#include "HeapProfiler.h" // FIXME remove
37 #include <cstring> // For strlen()
46 // Save file version info
47 const uchar CSA::versionFlag = 2;
49 ////////////////////////////////////////////////////////////////////////////
50 // Class CSA::THuffAlphabetRank
52 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
58 this->codetable = codetable;
60 bool *B = new bool[n];
63 for (i=0; i< n; i++) {
64 printf("%c:", (char)((int)s[i]-128));
65 for (r=0;r<codetable[(int)s[i]].bits;r++)
66 if (codetable[(int)s[i]].code & (1u <<r))
72 if (level > 100) return;*/
74 if (codetable[(int)s[i]].code & (1u << level)) {
79 if (sum==0 || sum==n) {
84 uchar *sfirst, *ssecond;
86 sfirst = new uchar[n-sum];
88 ssecond = new uchar[sum];
91 if (B[i]) ssecond[k++] = s[i];
92 else sfirst[j++] = s[i];
93 ulong *Binbits = new ulong[n/W+1];
95 Tools::SetField(Binbits,1,i,B[i]);
97 bitrank = new BitRank(Binbits,n,true);
99 left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
103 right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
109 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
110 // testing that the code works correctly
118 if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
120 printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
126 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
127 if (left!=NULL) delete left;
128 if (right!=NULL) delete right;
134 ////////////////////////////////////////////////////////////////////////////
138 * Constructor inits an empty dynamic FM-index.
139 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
141 CSA::CSA(unsigned samplerate)
142 : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
143 suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
146 this->samplerate = samplerate;
149 // FIXME TODO : DynFMI needs distribution of characters before hand
150 // This will create fully balanced wavelet tree for all chars [0, 255].
152 for (unsigned i = 0; i < 255; ++i)
155 dynFMI = new DynFMI(temp, 1, 255, false);
157 /* Debug code: take char distribution from data.txt.
158 uchar *temp = Tools::GetFileContents("data.txt", 0);
159 dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
167 * Given text must include end-marker.
168 * Text identifiers are numbered starting from 0.
170 void CSA::InsertText(uchar const * text)
177 TextPosition m = std::strlen((char *)text) + 1;
178 if (m > maxTextLength)
179 maxTextLength = m; // Store length of the longest text seen so far.
184 this->numberOfTexts ++;
185 this->numberOfAllTexts ++;
187 dynFMI->addText(text, m);
190 texts.append((char *) text);
191 texts.append(1, '\0');
195 emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
196 this->numberOfAllTexts ++;
200 void CSA::MakeStatic()
205 throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
208 if (texts.size() == 0) // Empty collection
212 texts.append(1, '\0');
215 // Bitvector of empty texts, FIXME Remove?
217 //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
218 ulong *tempB = new ulong[numberOfAllTexts/W + 1];
219 for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
221 for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
222 Tools::SetField(tempB, 1, (*it), 1);
224 emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
227 texts.reserve(0); // Release extra capacity
229 /* FILE *fp = fopen("texts.txt", "wb");
230 std::cout << "Wrote " << fwrite(texts.c_str(), 1, n, fp) << " bytes into texts.txt." << std::endl;
233 uchar *bwt = new uchar[n];
235 // More succinct solution via StringIterator, see below.
236 /* unsigned* maptexts = new unsigned[n+1];
237 // Map text chars to [0..255+numberOfTexts]
239 for (ulong i = 0; i < n; ++i)
241 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
243 uchar c = (uchar)texts[i];
244 maptexts[i] = (unsigned)c + numberOfTexts;
247 assert(count == numberOfTexts);
249 std::cout << "maptext: ";
250 for (ulong i = 0; i <= n; ++i)
251 std::cout << maptexts[i] << ", ";
252 std::cout << std::endl;*/
254 // Mark endmarker positions into bitvector
255 ulong * endmarkers = new ulong[n/W+1];
256 for (ulong i = 0; i < n/W+1; ++i)
258 for (ulong i = 0; i < n; ++i)
260 Tools::SetField(endmarkers, 1, i, 1);
261 BitRank* br = new BitRank(endmarkers, n, true);
262 assert(numberOfTexts == br->rank(n-1));
264 // Build iterators, FIXME clean up iterator construction.
265 StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br);
266 StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br);
268 bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n],
269 &bwt[0], numberOfTexts);
271 bwt[--bwtEndPos] = '\0';
273 texts.reserve(0); // Release capacity
274 delete br; // bitvector of endmarkers
275 } // End of bw transform
276 // std::cerr << "Time after BWT: " << Tools::GetTime() << std::endl;
278 /* fp = fopen("bwt.txt", "wb");
279 std::cout << "Wrote " << fwrite(bwt, 1, n, fp) << " bytes into bwt.txt." << std::endl;
285 uchar *bwtTest = dynFMI->getBWT();
286 printf("123456789012345678901234567890123456789\n");
287 for (TextPosition i = 0; i < n && i < 100; i ++)
289 printf("%d", (int)bwt[i]);
291 printf("%c", bwt[i]);
293 for (TextPosition i = 0; i < n && i < 100; i ++)
295 printf("%d", (int)bwtTest[i]);
297 printf("%c", bwtTest[i]);
301 assert(numberOfTexts == dynFMI->getCollectionSize());
305 for (ulong i = 0; i < n; ++i)
306 if (bwt[i] != bwtTest[i])
308 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
313 #endif // CSA_TEST_BWT
315 makewavelet(bwt); // Deletes bwt!
318 // Make sampling tables
322 bool CSA::EmptyText(DocId k) const
324 assert(k < (DocId)numberOfTexts);
325 if (emptyTextRank->IsBitSet(k))
330 uchar* CSA::GetText(DocId k) const
332 assert(k < (DocId)numberOfTexts);
335 if (emptyTextRank->IsBitSet(k, &textRank))
337 uchar* result = new uchar[1];
341 // Map to non-empty text
342 k -= textRank; //emptyTextRank->rank(k);
347 // Reserve average string length to avoid reallocs
348 result.reserve(n/numberOfTexts);
350 uchar c = alphabetrank->access(i);
354 i = C[c]+alphabetrank->rank(c,i)-1;
356 c = alphabetrank->access(i); // "next" char.
359 // Convert to uchar (FIXME return string?)
361 uchar* res = new uchar[i+1];
363 for (ulong j = 0; j < i; ++j)
364 res[i-j-1] = result[j];
369 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
371 assert(k < (DocId)numberOfTexts);
372 assert(j < (*textLength)[k]);
376 if (emptyTextRank->IsBitSet(k, &textRank))
378 uchar* result = new uchar[1];
383 // Map to non-empty text
384 k -= textRank; // emptyTextRank->rank(k);
386 // Start position of k'th text
387 ulong start = (*textStartPos)[k];
389 return Substring(i + start, j-i+1);
392 /******************************************************************
393 * Existential queries
395 bool CSA::IsPrefix(uchar const * pattern) const
397 TextPosition m = strlen((char *)pattern);
401 TextPosition sp = 0, ep = 0;
402 Search(pattern, m, &sp, &ep);
404 // Check for end-marker(s) in result interval
405 if (CountEndmarkers(sp, ep))
410 bool CSA::IsPrefix(uchar const * pattern, DocId begin, DocId end) const
412 TextPosition m = strlen((char *)pattern);
416 TextPosition sp = 0, ep = 0;
417 Search(pattern, m, &sp, &ep);
419 // Check for end-marker(s) in result interval
420 if (CountEndmarkers(sp, ep, begin, end))
426 bool CSA::IsSuffix(uchar const *pattern) const
428 // Here counting is as fast as IsSuffix():
429 if (CountSuffix(pattern) > 0)
434 bool CSA::IsSuffix(uchar const *pattern, DocId begin, DocId end) const
436 // Here counting is as fast as IsSuffix():
437 if (CountSuffix(pattern, begin, end) > 0)
442 bool CSA::IsEqual(uchar const *pattern) const
444 TextPosition m = std::strlen((char *)pattern);
447 if (numberOfAllTexts - numberOfTexts > 0u)
448 return true; // Empty texts exists
449 return false; // No empty texts exists
452 TextPosition sp = 0, ep = 0;
453 // Match including end-marker
454 Search(pattern, m+1, &sp, &ep);
456 // Check for end-marker(s) in result interval
457 if (CountEndmarkers(sp, ep))
462 bool CSA::IsEqual(uchar const *pattern, DocId begin, DocId end) const
464 TextPosition m = std::strlen((char *)pattern);
467 if (numberOfAllTexts - numberOfTexts > 0u)
468 return true; // Empty texts exists
469 return false; // No empty texts exists
472 TextPosition sp = 0, ep = 0;
473 // Match including end-marker
474 Search(pattern, m+1, &sp, &ep, begin, end);
476 // Check for end-marker(s) in result interval
477 if (CountEndmarkers(sp, ep))
482 bool CSA::IsContains(uchar const * pattern) const
484 TextPosition m = strlen((char *)pattern);
488 TextPosition sp = 0, ep = 0;
489 // Just check if pattern exists somewhere
490 ulong count = Search(pattern, m, &sp, &ep);
497 bool CSA::IsContains(uchar const * pattern, DocId begin, DocId end) const
499 // Here counting is as fast as existential querying
500 if (CountContains(pattern, begin, end) > 0)
501 return true; // FIXME No need to filter result set
505 bool CSA::IsLessThan(uchar const * pattern) const
507 if (CountLessThan(pattern) > 0)
512 bool CSA::IsLessThan(uchar const * pattern, DocId begin, DocId end) const
514 if (CountLessThan(pattern, begin, end) > 0)
519 /******************************************************************
522 ulong CSA::Count(uchar const * pattern) const
524 TextPosition m = strlen((char *)pattern);
528 TextPosition sp = 0, ep = 0;
529 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
533 unsigned CSA::CountPrefix(uchar const * pattern) const
535 TextPosition m = strlen((char *)pattern);
537 return numberOfAllTexts;
539 TextPosition sp = 0, ep = 0;
540 Search(pattern, m, &sp, &ep);
542 // Count end-markers in result interval
543 return CountEndmarkers(sp, ep);
546 unsigned CSA::CountPrefix(uchar const * pattern, DocId begin, DocId end) const
548 TextPosition m = strlen((char *)pattern);
550 return numberOfAllTexts;
552 TextPosition sp = 0, ep = 0;
553 Search(pattern, m, &sp, &ep);
555 // Count end-markers in result interval
556 return CountEndmarkers(sp, ep, begin, end);
559 unsigned CSA::CountSuffix(uchar const * pattern) const
561 TextPosition m = strlen((char *)pattern);
563 return numberOfAllTexts;
565 TextPosition sp = 0, ep = 0;
566 // Search with end-marker
567 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
572 unsigned CSA::CountSuffix(uchar const * pattern, DocId begin, DocId end) const
574 TextPosition m = strlen((char *)pattern);
576 return numberOfAllTexts;
578 TextPosition sp = 0, ep = 0;
579 // Search with end-marker
580 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep, begin, end);
585 unsigned CSA::CountEqual(uchar const *pattern) const
587 TextPosition m = strlen((char const *)pattern);
589 return numberOfAllTexts - numberOfTexts; // Empty texts.
591 TextPosition sp = 0, ep = 0;
592 // Match including end-marker
593 Search(pattern, m+1, &sp, &ep);
595 // Count end-markers in result interval
596 return CountEndmarkers(sp, ep);
599 unsigned CSA::CountEqual(uchar const *pattern, DocId begin, DocId end) const
601 TextPosition m = strlen((char const *)pattern);
603 return numberOfAllTexts - numberOfTexts; // Empty texts.
605 TextPosition sp = 0, ep = 0;
606 // Match including end-marker
607 Search(pattern, m+1, &sp, &ep, begin, end);
609 // Count end-markers in result interval
610 return CountEndmarkers(sp, ep); // Already within [begin, end]
613 unsigned CSA::CountContains(uchar const * pattern) const
615 TextPosition m = strlen((char const *)pattern);
617 return numberOfAllTexts; // Total number of texts.
619 // Here counting is as slow as fetching the result set
620 // because we have to filter out occ's that fall within same document.
621 TextCollection::document_result result = Contains(pattern);
622 return result.size();
625 unsigned CSA::CountContains(uchar const * pattern, DocId begin, DocId end) const
627 TextPosition m = strlen((char const *)pattern);
629 return numberOfAllTexts; // Total number of texts.
631 // Here counting is as slow as fetching the result set
632 // because we have to filter out occ's that fall within same document.
633 TextCollection::document_result result = Contains(pattern, begin, end);
634 return result.size();
637 // Less than or equal
638 unsigned CSA::CountLessThan(uchar const * pattern) const
640 TextPosition m = strlen((char const *)pattern);
642 return numberOfAllTexts - numberOfTexts; // Empty texts.
644 TextPosition sp = 0, ep = 0;
645 SearchLessThan(pattern, m, &sp, &ep);
647 // Count end-markers in result interval
648 return CountEndmarkers(sp, ep);
651 unsigned CSA::CountLessThan(uchar const * pattern, DocId begin, DocId end) const
653 TextPosition m = strlen((char const *)pattern);
655 return numberOfAllTexts - numberOfTexts; // Empty texts.
657 TextPosition sp = 0, ep = 0;
658 SearchLessThan(pattern, m, &sp, &ep);
660 // Count end-markers in result interval
661 return CountEndmarkers(sp, ep, begin, end);
665 * Document reporting queries
667 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
669 TextPosition m = strlen((char *)pattern);
671 return TextCollection::document_result(); // FIXME Should return all 1...k
673 TextPosition sp = 0, ep = 0;
674 Search(pattern, m, &sp, &ep);
676 TextCollection::document_result result;
678 // Report end-markers in result interval
679 unsigned resultSize = CountEndmarkers(sp, ep);
683 result.reserve(resultSize); // Try to avoid reallocation.
685 // Iterate through end-markers in [sp,ep]:
688 i = alphabetrank->rank(0, sp - 1);
691 // End-marker we found belongs to the "preceeding" doc in the collection
692 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
694 docId = emptyTextRank->select0(docId+1);
695 result.push_back(docId);
704 TextCollection::document_result CSA::Prefix(uchar const * pattern, DocId begin, DocId end) const
706 TextPosition m = strlen((char *)pattern);
708 return TextCollection::document_result(); // FIXME Should return all 1...k
710 TextPosition sp = 0, ep = 0;
711 Search(pattern, m, &sp, &ep);
713 TextCollection::document_result result;
715 // Report end-markers in result interval
716 unsigned resultSize = CountEndmarkers(sp, ep);
720 result.reserve(resultSize); // Try to avoid reallocation.
722 // Iterate through end-markers in [sp,ep] and [begin, end]:
725 i = alphabetrank->rank(0, sp - 1);
728 // End-marker we found belongs to the "preceeding" doc in the collection
729 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
731 docId = emptyTextRank->select0(docId+1);
732 if (docId >= begin && docId <= end)
733 result.push_back(docId);
742 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
744 TextPosition m = strlen((char *)pattern);
746 return TextCollection::document_result(); // FIXME Should return all 1...k
748 TextPosition sp = 0, ep = 0;
749 // Search with end-marker
750 Search(pattern, m+1, &sp, &ep);
752 TextCollection::document_result result;
753 result.reserve(ep-sp+1); // Try to avoid reallocation.
755 ulong sampled_rank_i = 0;
756 // Check each occurrence
757 for (; sp <= ep; ++sp)
761 uchar c = alphabetrank->access(i);
762 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
764 i = C[c]+alphabetrank->rank(c,i)-1;
765 c = alphabetrank->access(i);
767 // Assert: c == '\0' OR sampled->IsBitSet(i)
771 // Rank among the end-markers in BWT
772 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
774 // End-marker that we found belongs to the "preceeding" doc in collection:
775 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
777 docId = emptyTextRank->select0(docId+1);
778 result.push_back(docId);
780 else // Sampled position
782 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
784 docId = emptyTextRank->select0(docId+1);
785 result.push_back(docId);
792 TextCollection::document_result CSA::Suffix(uchar const * pattern, DocId begin, DocId end) const
794 TextPosition m = strlen((char *)pattern);
796 return TextCollection::document_result(); // FIXME Should return all 1...k
798 TextPosition sp = 0, ep = 0;
799 // Search with end-marker
800 Search(pattern, m+1, &sp, &ep, begin, end);
802 TextCollection::document_result result;
803 result.reserve(ep-sp+1); // Try to avoid reallocation.
805 ulong sampled_rank_i = 0;
806 // Check each occurrence, already within [begin, end]
807 for (; sp <= ep; ++sp)
811 uchar c = alphabetrank->access(i);
812 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
814 i = C[c]+alphabetrank->rank(c,i)-1;
815 c = alphabetrank->access(i);
817 // Assert: c == '\0' OR sampled->IsBitSet(i)
821 // Rank among the end-markers in BWT
822 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
824 // End-marker that we found belongs to the "preceeding" doc in collection:
825 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
827 docId = emptyTextRank->select0(docId+1);
828 result.push_back(docId);
830 else // Sampled position
832 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
834 docId = emptyTextRank->select0(docId+1);
835 result.push_back(docId);
843 TextCollection::document_result CSA::Equal(uchar const *pattern) const
845 TextPosition m = strlen((char const *)pattern);
847 return TextCollection::document_result(); // FIXME Should return all empty texts
849 TextPosition sp = 0, ep = 0;
850 // Match including end-marker
851 Search(pattern, m+1, &sp, &ep);
853 TextCollection::document_result result;
855 // Report end-markers in result interval
856 unsigned resultSize = CountEndmarkers(sp, ep);
860 result.reserve(resultSize); // Try to avoid reallocation.
864 i = alphabetrank->rank(0, sp - 1);
867 // End-marker we found belongs to the "previous" doc in the collection
868 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
870 docId = emptyTextRank->select0(docId+1);
871 result.push_back(docId);
880 TextCollection::document_result CSA::Equal(uchar const *pattern, DocId begin, DocId end) const
882 TextPosition m = strlen((char const *)pattern);
884 return TextCollection::document_result(); // FIXME Should return all empty texts
886 TextPosition sp = 0, ep = 0;
887 // Match including end-marker
888 Search(pattern, m+1, &sp, &ep, begin, end);
890 TextCollection::document_result result;
892 // Report end-markers in result interval
893 unsigned resultSize = CountEndmarkers(sp, ep);
897 result.reserve(resultSize); // Try to avoid reallocation.
901 i = alphabetrank->rank(0, sp - 1);
904 // End-marker we found belongs to the "previous" doc in the collection
905 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
907 docId = emptyTextRank->select0(docId+1);
908 result.push_back(docId); // already within [begin, end]
918 TextCollection::document_result CSA::Contains(uchar const * pattern) const
920 TextPosition m = strlen((char *)pattern);
922 return TextCollection::document_result();
924 TextPosition sp = 0, ep = 0;
925 // Search all occurrences
926 Search(pattern, m, &sp, &ep);
928 // We want unique document indentifiers, using std::set to collect them
929 std::set<DocId> resultSet;
931 ulong sampled_rank_i = 0;
932 // Check each occurrence
933 for (; sp <= ep; ++sp)
936 uchar c = alphabetrank->access(i);
937 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
939 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
940 c = alphabetrank->access(i);
944 // Rank among the end-markers in BWT
945 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
947 // End-marker that we found belongs to the "preceeding" doc in collection:
948 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
949 resultSet.insert(docId);
953 DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
954 assert((unsigned)di < numberOfTexts);
955 resultSet.insert(di);
959 // Convert std::set to std::vector
960 TextCollection::document_result result(resultSet.begin(), resultSet.end());
962 for (document_result::iterator it = result.begin(); it != result.end(); ++it)
963 *it = emptyTextRank->select0(*it+1);
967 TextCollection::document_result CSA::Contains(uchar const * pattern, DocId begin, DocId end) const
969 TextPosition m = strlen((char *)pattern);
971 return TextCollection::document_result();
973 TextPosition sp = 0, ep = 0;
974 // Search all occurrences
975 Search(pattern, m, &sp, &ep);
977 // We want unique document indentifiers, using std::set to collect them
978 std::set<DocId> resultSet;
980 ulong sampled_rank_i = 0;
981 // Check each occurrence
982 for (; sp <= ep; ++sp)
985 uchar c = alphabetrank->access(i);
986 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
988 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
989 c = alphabetrank->access(i);
993 // Rank among the end-markers in BWT
994 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
996 // End-marker that we found belongs to the "preceeding" doc in collection:
997 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
998 if (docId >= begin && docId <= end)
999 resultSet.insert(docId);
1003 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1004 assert((unsigned)docId < numberOfTexts);
1005 if (docId >= begin && docId <= end)
1006 resultSet.insert(docId);
1010 // Convert std::set to std::vector
1011 TextCollection::document_result result(resultSet.begin(), resultSet.end());
1013 for (document_result::iterator it = result.begin(); it != result.end(); ++it)
1014 *it = emptyTextRank->select0(*it+1);
1018 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
1020 TextPosition m = strlen((char *)pattern);
1022 return TextCollection::document_result(); // empty result set
1024 TextPosition sp = 0, ep = 0;
1025 SearchLessThan(pattern, m, &sp, &ep);
1027 TextCollection::document_result result;
1029 // Report end-markers in result interval
1030 unsigned resultSize = CountEndmarkers(sp, ep);
1031 if (resultSize == 0)
1034 result.reserve(resultSize); // Try to avoid reallocation.
1036 // Iterate through end-markers in [sp,ep]:
1039 i = alphabetrank->rank(0, sp - 1);
1042 // End-marker we found belongs to the "preceeding" doc in the collection
1043 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1045 docId = emptyTextRank->select0(docId+1);
1046 result.push_back(docId);
1055 TextCollection::document_result CSA::LessThan(uchar const * pattern, DocId begin, DocId end) const
1057 TextPosition m = strlen((char *)pattern);
1059 return TextCollection::document_result(); // empty result set
1061 TextPosition sp = 0, ep = 0;
1062 SearchLessThan(pattern, m, &sp, &ep);
1064 TextCollection::document_result result;
1066 // Report end-markers in result interval
1067 unsigned resultSize = CountEndmarkers(sp, ep);
1068 if (resultSize == 0)
1071 result.reserve(resultSize); // Try to avoid reallocation.
1073 // Iterate through end-markers in [sp,ep] and [begin, end]:
1076 i = alphabetrank->rank(0, sp - 1);
1079 // End-marker we found belongs to the "preceeding" doc in the collection
1080 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1082 docId = emptyTextRank->select0(docId+1);
1083 if (docId >= begin && docId <= end)
1084 result.push_back(docId);
1094 * Full result set queries
1096 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
1098 TextPosition m = strlen((char *)pattern);
1100 return full_result(); // FIXME Throw exception?
1102 TextPosition sp = 0, ep = 0;
1103 // Search all occurrences
1104 Search(pattern, m, &sp, &ep);
1107 result.reserve(ep-sp+1); // Try to avoid reallocation.
1109 ulong sampled_rank_i = 0;
1110 // Report each occurrence
1111 for (; sp <= ep; ++sp)
1113 TextPosition i = sp;
1114 TextPosition dist = 0;
1115 uchar c = alphabetrank->access(i);
1116 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
1118 i = C[c]+alphabetrank->rank(c,i)-1;
1119 c = alphabetrank->access(i);
1124 // Rank among the end-markers in BWT
1125 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
1127 // End-marker that we found belongs to the "preceeding" doc in collection:
1128 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
1130 docId = emptyTextRank->select0(docId+1);
1131 result.push_back(make_pair(docId, dist));
1135 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
1136 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1137 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
1140 docId = emptyTextRank->select0(docId+1);
1141 result.push_back(make_pair(docId, textPos));
1148 TextCollection::full_result CSA::FullContains(uchar const * pattern, DocId begin, DocId end) const
1150 TextPosition m = strlen((char *)pattern);
1152 return full_result(); // FIXME Throw exception?
1154 TextPosition sp = 0, ep = 0;
1155 // Search all occurrences
1156 Search(pattern, m, &sp, &ep);
1159 result.reserve(ep-sp+1); // Try to avoid reallocation.
1161 ulong sampled_rank_i = 0;
1162 // Report each occurrence
1163 for (; sp <= ep; ++sp)
1165 TextPosition i = sp;
1166 TextPosition dist = 0;
1167 uchar c = alphabetrank->access(i);
1168 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
1170 i = C[c]+alphabetrank->rank(c,i)-1;
1171 c = alphabetrank->access(i);
1176 // Rank among the end-markers in BWT
1177 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
1179 // End-marker that we found belongs to the "preceeding" doc in collection:
1180 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
1182 docId = emptyTextRank->select0(docId+1);
1183 if (docId >= begin && docId <= end)
1184 result.push_back(make_pair(docId, dist));
1188 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
1189 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
1190 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
1193 docId = emptyTextRank->select0(docId+1);
1194 if (docId >= begin && docId <= end)
1195 result.push_back(make_pair(docId, textPos));
1204 * Save index to a file handle
1206 * Throws a std::runtime_error exception on i/o error.
1207 * First byte that is saved represents the version number of the save file.
1208 * In version 2 files, the data fields are:
1209 * uchar versionFlag;
1211 unsigned samplerate;
1213 TextPosition bwtEndPos;
1214 static_sequence * alphabetrank;
1216 BlockArray *suffixes;
1217 BlockArray *suffixDocId;
1218 unsigned numberOfTexts;
1219 unsigned numberOfAllTexts;
1220 ulong maxTextLength;
1221 BlockArray *endmarkerDocId;
1222 BSGAP *emptyTextRank;
1224 void CSA::Save(FILE *file) const
1226 // Saving version info:
1227 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
1228 throw std::runtime_error("CSA::Save(): file write error (version flag).");
1230 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
1231 throw std::runtime_error("CSA::Save(): file write error (n).");
1232 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
1233 throw std::runtime_error("CSA::Save(): file write error (samplerate).");
1235 for(ulong i = 0; i < 256; ++i)
1236 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
1237 throw std::runtime_error("CSA::Save(): file write error (C table).");
1239 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1240 throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
1242 alphabetrank->save(file);
1243 sampled->Save(file);
1244 suffixes->Save(file);
1245 suffixDocId->Save(file);
1247 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1248 throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
1249 if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1250 throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
1251 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1252 throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
1254 endmarkerDocId->Save(file);
1255 emptyTextRank->Save(file);
1261 * Load index from a file handle
1263 * Throws a std::runtime_error exception on i/o error.
1264 * For more info, see CSA::Save().
1266 void CSA::Load(FILE *file, unsigned samplerate)
1268 // Delete everything
1270 delete dynFMI; dynFMI = 0;
1272 delete alphabetrank; alphabetrank = 0;
1273 delete sampled; sampled = 0;
1274 delete suffixes; suffixes = 0;
1275 delete suffixDocId; suffixDocId = 0;
1276 delete [] codetable; codetable = 0;
1278 delete endmarkerDocId; endmarkerDocId = 0;
1279 delete emptyTextRank; emptyTextRank = 0;
1281 this->maxTextLength = 0;
1282 this->numberOfTexts = 0;
1283 this->numberOfAllTexts = 0;
1284 this->samplerate = samplerate;
1288 if (std::fread(&verFlag, 1, 1, file) != 1)
1289 throw std::runtime_error("CSA::Load(): file read error (version flag).");
1290 if (verFlag != CSA::versionFlag)
1291 throw std::runtime_error("CSA::Load(): invalid save file version.");
1293 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
1294 throw std::runtime_error("CSA::Load(): file read error (n).");
1295 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
1296 throw std::runtime_error("CSA::Load(): file read error (samplerate).");
1297 // FIXME samplerate can not be changed during load.
1298 // if (this->samplerate == 0)
1299 // this->samplerate = samplerate;
1301 for(ulong i = 0; i < 256; ++i)
1302 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
1303 throw std::runtime_error("CSA::Load(): file read error (C table).");
1305 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
1306 throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
1308 alphabetrank = static_sequence::load(file);
1309 sampled = new BSGAP(file);
1310 suffixes = new BlockArray(file);
1311 suffixDocId = new BlockArray(file);
1313 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
1314 throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
1315 if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
1316 throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
1317 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
1318 throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
1320 endmarkerDocId = new BlockArray(file);
1321 emptyTextRank = new BSGAP(file);
1323 // FIXME Construct data structures with new samplerate
1330 * Rest of the functions follow...
1334 // FIXME Use 2D-search structure
1335 unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep, DocId begin, DocId end) const
1342 ranksp = alphabetrank->rank(0, sp - 1);
1344 unsigned resultSize = alphabetrank->rank(0, ep) - ranksp;
1345 if (resultSize == 0)
1348 // Count end-markers in result interval and within [begin, end]
1350 unsigned i = ranksp;
1353 // End-marker we found belongs to the "previous" doc in the collection
1354 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
1356 docId = emptyTextRank->select0(docId+1);
1357 if (docId >= begin && docId <= end)
1366 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1368 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1369 int c = (int)pattern[m-1];
1371 TextPosition sp = C[c];
1372 TextPosition ep = C[c+1]-1;
1373 while (sp<=ep && i>=1)
1375 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1376 c = (int)pattern[--i];
1377 sp = C[c]+alphabetrank->rank(c,sp-1);
1378 ep = C[c]+alphabetrank->rank(c,ep)-1;
1388 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult, DocId begin, DocId end) const
1390 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1391 int c = (int)pattern[m-1];
1392 assert(c == 0); // Start from endmarkers
1394 TextPosition sp = begin;
1395 TextPosition ep = end;
1396 while (sp<=ep && i>=1)
1398 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1399 c = (int)pattern[--i];
1400 sp = C[c]+alphabetrank->rank(c,sp-1);
1401 ep = C[c]+alphabetrank->rank(c,ep)-1;
1412 ulong CSA::SearchLessThan(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
1414 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
1415 uint c = (int)pattern[m-1];
1417 TextPosition sp = 1;
1418 TextPosition ep = C[c+1]-1;
1419 while (sp<=ep && i>=1)
1421 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
1422 c = (int)pattern[--i];
1423 uint result = alphabetrank->rank(c,ep);
1442 delete alphabetrank;
1446 delete [] codetable; // FIXME remove
1448 delete endmarkerDocId;
1449 delete emptyTextRank;
1452 void CSA::makewavelet(uchar *bwt)
1461 if (C[i]>0) {min = i; break;}
1462 for (i=255;i>=min;--i)
1463 if (C[i]>0) {max = i; break;}
1465 ulong prev=C[0], temp;
1467 for (i=1;i<256;i++) {
1472 // this->codetable = node::makecodetable(bwt,n);
1473 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1475 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1476 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1477 // std::cerr << "max heap usage before WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1479 // HeapProfiler::ResetMaxHeapConsumption(); // FIXME remove
1481 alphabet_mapper * am = new alphabet_mapper_none();
1482 static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(8); // FIXME samplerate?
1483 // static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1484 wt_coder * wtc = new wt_coder_binary(bwt,n,am);
1485 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
1487 bwt = 0; // already deleted
1489 // std::cerr << "heap usage: " << HeapProfiler::GetHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1490 // std::cerr << "max heap usage after WT: " << HeapProfiler::GetMaxHeapConsumption()/(1024*1024) << " Mbytes" << std::endl;
1493 void CSA::maketables()
1495 // Calculate BWT end-marker position (of last inserted text)
1496 // Note: bwtEndPos already set!
1497 // and the length of the first text (counter l):
1502 uint alphabetrank_i_tmp = 0;
1503 uchar c = alphabetrank->access(i, alphabetrank_i_tmp);
1506 i = C[c]+alphabetrank_i_tmp-1;
1508 c = alphabetrank->access(i, alphabetrank_i_tmp);
1511 if (i != bwtEndPos) // compare result
1513 std::cout << "i = " << i << ", bwtendpos = " << bwtEndPos << std::endl;
1519 // Build up arrays for text length and starting positions
1520 // FIXME Temp, remove
1521 //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
1522 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
1523 //(*textLength)[0] = l;
1524 (*textStartPos)[0] = 0;
1526 // Construct samples
1527 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1528 unsigned ceilLog2n = Tools::CeilLog2(n);
1529 BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
1530 BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1532 // Mapping from end-markers to doc ID's:
1533 endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1535 ulong *sampledpositions = new ulong[n/W+1];
1536 for (ulong i=0;i<n/W+1;i++)
1537 sampledpositions[i]=0lu;
1539 ulong x,p=bwtEndPos;
1540 ulong sampleCount = 0;
1541 // Keeping track of text position of prev. end-marker seen
1542 ulong posOfSuccEndmarker = n;
1543 DocId textId = numberOfTexts;
1546 uint alphabetrank_i_tmp =0;
1549 for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1550 // i substitutes SA->GetPos(i)
1553 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1554 Tools::SetField(sampledpositions,1,p,1);
1555 (*positions)[sampleCount] = p;
1556 (*tmpSuffix)[sampleCount] = x; // FIXME remove
1560 uchar c = alphabetrank->access(p, alphabetrank_i_tmp);
1565 // Record the order of end-markers in BWT:
1566 ulong endmarkerRank = alphabetrank_i_tmp - 1;
1567 (*endmarkerDocId)[endmarkerRank] = textId;
1569 // Store text length and text start position:
1570 if (textId < (DocId)numberOfTexts - 1)
1572 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1573 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1574 posOfSuccEndmarker = x;
1577 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1578 p = textId; // Correct LF-mapping to the last char of the previous text.
1581 p = C[c]+alphabetrank_i_tmp-1;
1583 assert(textId == 0);
1585 sampled = new BSGAP(sampledpositions,n,true);
1586 sampleLength = sampled->rank(n-1);
1587 assert(sampleCount == sampleLength);
1589 // Suffixes store an offset from the text start position
1590 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1591 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1593 for(ulong i=0; i<sampleLength; i++) {
1594 assert((*positions)[i] < n);
1595 ulong j = sampled->rank((*positions)[i]);
1596 if (j==0) j=sampleLength;
1597 TextPosition textPos = (*tmpSuffix)[i];
1598 (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1600 assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1601 assert((*suffixDocId)[j-1] < numberOfTexts);
1602 // calculate offset from text start:
1603 (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1605 // FIXME Temp, remove
1608 // delete textLength;
1609 delete textStartPos;
1614 * Finds document identifier for given text position
1616 * Starting text position of the document is stored into second parameter.
1617 * Binary searching on text starting positions.
1619 TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1624 DocId b = numberOfTexts - 1;
1627 DocId c = a + (b - a)/2;
1628 if ((*textStartPos)[c] > i)
1630 else if ((*textStartPos)[c+1] > i)
1636 assert(a < (DocId)numberOfTexts);
1637 assert(i >= (*textStartPos)[a]);
1638 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1642 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1644 TCodeEntry *result = new TCodeEntry[ 256 ];
1646 count_chars( text, n, result );
1647 std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1649 // First I push all the leaf nodes into the queue
1651 for ( unsigned int i = 0 ; i < 256 ; i++ )
1652 if ( result[ i ].count )
1653 q.push(node( i, result[ i ].count ) );
1655 // This loop removes the two smallest nodes from the
1656 // queue. It creates a new internal node that has
1657 // those two nodes as children. The new internal node
1658 // is then inserted into the priority queue. When there
1659 // is only one node in the priority queue, the tree
1663 while ( q.size() > 1 ) {
1664 node *child0 = new node( q.top() );
1666 node *child1 = new node( q.top() );
1668 q.push( node( child0, child1 ) );
1671 // Now I compute and return the codetable
1673 q.top().maketable(0u,0u, result);
1679 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1683 child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1684 child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1690 codetable[value].code = code;
1691 codetable[value].bits = bits;
1695 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1698 for (i = 0 ; i < 256 ; i++ )
1699 counts[ i ].count = 0;
1701 counts[(int)text[i]].count++;
1704 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1705 return x | (bit << pos);