1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #include "TextCollection.h"
23 // Conflicts with std::min and std::max
26 #include "dcover/bwt.hpp"
28 #include "HeapProfiler.h" // FIXME remove
37 #include <cstring> // For strlen()
46 // Save file version info
47 const uchar CSA::versionFlag = 2;
49 ////////////////////////////////////////////////////////////////////////////
50 // Class CSA::THuffAlphabetRank
52 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
58 this->codetable = codetable;
60 bool *B = new bool[n];
63 for (i=0; i< n; i++) {
64 printf("%c:", (char)((int)s[i]-128));
65 for (r=0;r<codetable[(int)s[i]].bits;r++)
66 if (codetable[(int)s[i]].code & (1u <<r))
72 if (level > 100) return;*/
74 if (codetable[(int)s[i]].code & (1u << level)) {
79 if (sum==0 || sum==n) {
84 uchar *sfirst, *ssecond;
86 sfirst = new uchar[n-sum];
88 ssecond = new uchar[sum];
91 if (B[i]) ssecond[k++] = s[i];
92 else sfirst[j++] = s[i];
93 ulong *Binbits = new ulong[n/W+1];
95 Tools::SetField(Binbits,1,i,B[i]);
97 bitrank = new BitRank(Binbits,n,true);
99 left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
103 right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
109 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
110 // testing that the code works correctly
118 if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
120 printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
126 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
127 if (left!=NULL) delete left;
128 if (right!=NULL) delete right;
134 ////////////////////////////////////////////////////////////////////////////
138 * Constructor inits an empty dynamic FM-index.
139 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
141 CSA::CSA(unsigned samplerate)
142 : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
143 suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
146 this->samplerate = samplerate;
149 // FIXME TODO : DynFMI needs distribution of characters before hand
150 // This will create fully balanced wavelet tree for all chars [0, 255].
152 for (unsigned i = 0; i < 255; ++i)
155 dynFMI = new DynFMI(temp, 1, 255, false);
157 /* Debug code: take char distribution from data.txt.
158 uchar *temp = Tools::GetFileContents("data.txt", 0);
159 dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
167 * Given text must include end-marker.
168 * Text identifiers are numbered starting from 0.
170 void CSA::InsertText(uchar const * text)
177 TextPosition m = std::strlen((char *)text) + 1;
178 if (m > maxTextLength)
179 maxTextLength = m; // Store length of the longest text seen so far.
184 this->numberOfTexts ++;
185 this->numberOfAllTexts ++;
187 dynFMI->addText(text, m);
190 texts.append((char *) text);
191 texts.append(1, '\0');
195 emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
196 this->numberOfAllTexts ++;
200 void CSA::MakeStatic()
205 throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
208 // Bitvector of empty texts, FIXME Remove?
210 //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
211 ulong *tempB = new ulong[numberOfAllTexts/W + 1];
212 for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
214 for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
215 Tools::SetField(tempB, 1, (*it), 1);
217 emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
220 uchar *bwt = new uchar[n];
222 // More succinct solution via StringIterator, see below.
223 /* unsigned* maptexts = new unsigned[n+1];
224 // Map text chars to [0..255+numberOfTexts]
226 for (ulong i = 0; i < n; ++i)
228 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
230 uchar c = (uchar)texts[i];
231 maptexts[i] = (unsigned)c + numberOfTexts;
234 assert(count == numberOfTexts);
236 std::cout << "maptext: ";
237 for (ulong i = 0; i <= n; ++i)
238 std::cout << maptexts[i] << ", ";
239 std::cout << std::endl;*/
241 // Mark endmarker positions into bitvector
242 ulong * endmarkers = new ulong[n/W+1];
243 for (ulong i = 0; i < n/W+1; ++i)
245 for (ulong i = 0; i < n; ++i)
247 Tools::SetField(endmarkers, 1, i, 1);
248 BitRank* br = new BitRank(endmarkers, n, true);
249 assert(numberOfTexts == br->rank(n-1));
251 // Build iterators, FIXME clean up iterator construction.
252 StringIterator itBegin((uchar const *)texts.c_str(), (uchar const *)texts.c_str(), n, numberOfTexts, br);
253 StringIterator itEnd((uchar const *)texts.c_str() + n,(uchar const *)texts.c_str(), n, numberOfTexts, br);
255 bwtEndPos = (ulong)compute_bwt(itBegin, itEnd, //&maptexts[0], &maptexts[n],
256 &bwt[0], numberOfTexts);
258 bwt[--bwtEndPos] = '\0';
260 texts.reserve(0); // Release capacity
261 delete br; // bitvector of endmarkers
262 } // End of bw transform
266 uchar *bwtTest = dynFMI->getBWT();
267 printf("123456789012345678901234567890123456789\n");
268 for (TextPosition i = 0; i < n && i < 100; i ++)
270 printf("%d", (int)bwt[i]);
272 printf("%c", bwt[i]);
274 for (TextPosition i = 0; i < n && i < 100; i ++)
276 printf("%d", (int)bwtTest[i]);
278 printf("%c", bwtTest[i]);
282 assert(numberOfTexts == dynFMI->getCollectionSize());
286 for (ulong i = 0; i < n; ++i)
287 if (bwt[i] != bwtTest[i])
289 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
294 #endif // CSA_TEST_BWT
296 makewavelet(bwt); // Deletes bwt!
299 // Make sampling tables
303 bool CSA::EmptyText(DocId k) const
305 assert(k < (DocId)numberOfTexts);
306 if (emptyTextRank->IsBitSet(k))
311 uchar* CSA::GetText(DocId k) const
313 assert(k < (DocId)numberOfTexts);
316 if (emptyTextRank->IsBitSet(k, &textRank))
318 uchar* result = new uchar[1];
322 // Map to non-empty text
323 k -= textRank; //emptyTextRank->rank(k);
328 // Reserve average string length to avoid reallocs
329 result.reserve(n/numberOfTexts);
331 uchar c = alphabetrank->access(i);
335 i = C[c]+alphabetrank->rank(c,i)-1;
337 c = alphabetrank->access(i); // "next" char.
340 // Convert to uchar (FIXME return string?)
342 uchar* res = new uchar[i+1];
344 for (ulong j = 0; j < i; ++j)
345 res[i-j-1] = result[j];
350 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
352 assert(k < (DocId)numberOfTexts);
353 assert(j < (*textLength)[k]);
357 if (emptyTextRank->IsBitSet(k, &textRank))
359 uchar* result = new uchar[1];
364 // Map to non-empty text
365 k -= textRank; // emptyTextRank->rank(k);
367 // Start position of k'th text
368 ulong start = (*textStartPos)[k];
370 return Substring(i + start, j-i+1);
373 /******************************************************************
374 * Existential queries
376 bool CSA::IsPrefix(uchar const * pattern) const
378 TextPosition m = strlen((char *)pattern);
382 TextPosition sp = 0, ep = 0;
383 Search(pattern, m, &sp, &ep);
385 // Check for end-marker(s) in result interval
386 if (CountEndmarkers(sp, ep))
391 bool CSA::IsSuffix(uchar const *pattern) const
393 // Here counting is as fast as IsSuffix():
394 if (CountSuffix(pattern) > 0)
399 bool CSA::IsEqual(uchar const *pattern) const
401 TextPosition m = std::strlen((char *)pattern);
404 if (numberOfAllTexts - numberOfTexts > 0u)
405 return true; // Empty texts exists
406 return false; // No empty texts exists
409 TextPosition sp = 0, ep = 0;
410 // Match including end-marker
411 Search(pattern, m+1, &sp, &ep);
413 // Check for end-marker(s) in result interval
414 if (CountEndmarkers(sp, ep))
419 bool CSA::IsContains(uchar const * pattern) const
421 TextPosition m = strlen((char *)pattern);
425 TextPosition sp = 0, ep = 0;
426 // Just check if pattern exists somewhere
427 ulong count = Search(pattern, m, &sp, &ep);
434 bool CSA::IsLessThan(uchar const*) const
441 /******************************************************************
444 ulong CSA::Count(uchar const * pattern) const
446 TextPosition m = strlen((char *)pattern);
450 TextPosition sp = 0, ep = 0;
451 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
455 unsigned CSA::CountPrefix(uchar const * pattern) const
457 TextPosition m = strlen((char *)pattern);
459 return numberOfAllTexts;
461 TextPosition sp = 0, ep = 0;
462 Search(pattern, m, &sp, &ep);
464 // Count end-markers in result interval
465 return CountEndmarkers(sp, ep);
468 unsigned CSA::CountSuffix(uchar const * pattern) const
470 TextPosition m = strlen((char *)pattern);
472 return numberOfAllTexts;
474 TextPosition sp = 0, ep = 0;
475 // Search with end-marker
476 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
481 unsigned CSA::CountEqual(uchar const *pattern) const
483 TextPosition m = strlen((char const *)pattern);
485 return numberOfAllTexts - numberOfTexts; // Empty texts.
487 TextPosition sp = 0, ep = 0;
488 // Match including end-marker
489 Search(pattern, m+1, &sp, &ep);
491 // Count end-markers in result interval
492 return CountEndmarkers(sp, ep);
495 unsigned CSA::CountContains(uchar const * pattern) const
497 TextPosition m = strlen((char const *)pattern);
499 return numberOfAllTexts; // Total number of texts.
501 // Here counting is as slow as fetching the result set
502 // because we would have to filter out occ's that fall within same document.
503 TextCollection::document_result result = Contains(pattern);
504 return result.size();
507 unsigned CSA::CountLessThan(uchar const * pattern) const
515 * Document reporting queries
517 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
519 TextPosition m = strlen((char *)pattern);
521 return TextCollection::document_result(); // FIXME Should return all 1...k
523 TextPosition sp = 0, ep = 0;
524 Search(pattern, m, &sp, &ep);
526 TextCollection::document_result result;
528 // Report end-markers in result interval
529 unsigned resultSize = CountEndmarkers(sp, ep);
533 result.reserve(resultSize); // Try to avoid reallocation.
535 // Iterate through end-markers in [sp,ep]:
538 i = alphabetrank->rank(0, sp - 1);
541 // End-marker we found belongs to the "preceeding" doc in the collection
542 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
544 docId = emptyTextRank->select0(docId+1);
545 result.push_back(docId);
554 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
556 TextPosition m = strlen((char *)pattern);
558 return TextCollection::document_result(); // FIXME Should return all 1...k
560 TextPosition sp = 0, ep = 0;
561 // Search with end-marker
562 Search(pattern, m+1, &sp, &ep);
564 TextCollection::document_result result;
565 result.reserve(ep-sp+1); // Try to avoid reallocation.
567 ulong sampled_rank_i = 0;
568 // Check each occurrence
569 for (; sp <= ep; ++sp)
573 uchar c = alphabetrank->access(i);
574 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
576 i = C[c]+alphabetrank->rank(c,i)-1;
577 c = alphabetrank->access(i);
579 // Assert: c == '\0' OR sampled->IsBitSet(i)
583 // Rank among the end-markers in BWT
584 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
586 // End-marker that we found belongs to the "preceeding" doc in collection:
587 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
589 docId = emptyTextRank->select0(docId+1);
590 result.push_back(docId);
592 else // Sampled position
594 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
596 docId = emptyTextRank->select0(docId+1);
597 result.push_back(docId);
604 TextCollection::document_result CSA::Equal(uchar const *pattern) const
606 TextPosition m = strlen((char const *)pattern);
608 return TextCollection::document_result(); // FIXME Should return all empty texts
610 TextPosition sp = 0, ep = 0;
611 // Match including end-marker
612 Search(pattern, m+1, &sp, &ep);
614 TextCollection::document_result result;
616 // Report end-markers in result interval
617 unsigned resultSize = CountEndmarkers(sp, ep);
621 result.reserve(resultSize); // Try to avoid reallocation.
625 i = alphabetrank->rank(0, sp - 1);
628 // End-marker we found belongs to the "previous" doc in the collection
629 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
631 docId = emptyTextRank->select0(docId+1);
632 result.push_back(docId);
642 TextCollection::document_result CSA::Contains(uchar const * pattern) const
644 TextPosition m = strlen((char *)pattern);
646 return TextCollection::document_result();
648 TextPosition sp = 0, ep = 0;
649 // Search all occurrences
650 Search(pattern, m, &sp, &ep);
652 // We want unique document indentifiers, using std::set to collect them
653 std::set<DocId> resultSet;
655 ulong sampled_rank_i = 0;
656 // Check each occurrence
657 for (; sp <= ep; ++sp)
660 uchar c = alphabetrank->access(i);
661 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
663 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
664 c = alphabetrank->access(i);
668 // Rank among the end-markers in BWT
669 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
671 // End-marker that we found belongs to the "preceeding" doc in collection:
672 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
673 resultSet.insert(docId);
677 DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
678 assert((unsigned)di < numberOfTexts);
679 resultSet.insert(di);
683 // Convert std::set to std::vector
684 TextCollection::document_result result(resultSet.begin(), resultSet.end());
686 for (document_result::iterator it = result.begin(); it != result.end(); ++it)
687 *it = emptyTextRank->select0(*it+1);
691 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
695 return document_result();
699 * Full result set queries
701 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
703 TextPosition m = strlen((char *)pattern);
705 return full_result(); // FIXME Throw exception?
707 TextPosition sp = 0, ep = 0;
708 // Search all occurrences
709 Search(pattern, m, &sp, &ep);
712 result.reserve(ep-sp+1); // Try to avoid reallocation.
714 ulong sampled_rank_i = 0;
715 // Report each occurrence
716 for (; sp <= ep; ++sp)
719 TextPosition dist = 0;
720 uchar c = alphabetrank->access(i);
721 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
723 i = C[c]+alphabetrank->rank(c,i)-1;
724 c = alphabetrank->access(i);
729 // Rank among the end-markers in BWT
730 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
732 // End-marker that we found belongs to the "preceeding" doc in collection:
733 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
735 docId = emptyTextRank->select0(docId+1);
736 result.push_back(make_pair(docId, dist));
740 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
741 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
742 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
745 docId = emptyTextRank->select0(docId+1);
746 result.push_back(make_pair(docId, textPos));
755 * Save index to a file handle
757 * Throws a std::runtime_error exception on i/o error.
758 * First byte that is saved represents the version number of the save file.
759 * In version 2 files, the data fields are:
764 TextPosition bwtEndPos;
765 static_sequence * alphabetrank;
767 BlockArray *suffixes;
768 BlockArray *suffixDocId;
769 unsigned numberOfTexts;
770 unsigned numberOfAllTexts;
772 BlockArray *endmarkerDocId;
773 BSGAP *emptyTextRank;
775 void CSA::Save(FILE *file) const
777 // Saving version info:
778 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
779 throw std::runtime_error("CSA::Save(): file write error (version flag).");
781 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
782 throw std::runtime_error("CSA::Save(): file write error (n).");
783 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
784 throw std::runtime_error("CSA::Save(): file write error (samplerate).");
786 for(ulong i = 0; i < 256; ++i)
787 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
788 throw std::runtime_error("CSA::Save(): file write error (C table).");
790 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
791 throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
793 alphabetrank->save(file);
795 suffixes->Save(file);
796 suffixDocId->Save(file);
798 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
799 throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
800 if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
801 throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
802 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
803 throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
805 endmarkerDocId->Save(file);
806 emptyTextRank->Save(file);
812 * Load index from a file handle
814 * Throws a std::runtime_error exception on i/o error.
815 * For more info, see CSA::Save().
817 void CSA::Load(FILE *file, unsigned samplerate)
821 delete dynFMI; dynFMI = 0;
823 delete alphabetrank; alphabetrank = 0;
824 delete sampled; sampled = 0;
825 delete suffixes; suffixes = 0;
826 delete suffixDocId; suffixDocId = 0;
827 delete [] codetable; codetable = 0;
829 delete endmarkerDocId; endmarkerDocId = 0;
830 delete emptyTextRank; emptyTextRank = 0;
832 this->maxTextLength = 0;
833 this->numberOfTexts = 0;
834 this->numberOfAllTexts = 0;
835 this->samplerate = samplerate;
839 if (std::fread(&verFlag, 1, 1, file) != 1)
840 throw std::runtime_error("CSA::Load(): file read error (version flag).");
841 if (verFlag != CSA::versionFlag)
842 throw std::runtime_error("CSA::Load(): invalid save file version.");
844 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
845 throw std::runtime_error("CSA::Load(): file read error (n).");
846 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
847 throw std::runtime_error("CSA::Load(): file read error (samplerate).");
848 // FIXME samplerate can not be changed during load.
849 // if (this->samplerate == 0)
850 // this->samplerate = samplerate;
852 for(ulong i = 0; i < 256; ++i)
853 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
854 throw std::runtime_error("CSA::Load(): file read error (C table).");
856 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
857 throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
859 alphabetrank = static_sequence::load(file);
860 sampled = new BSGAP(file);
861 suffixes = new BlockArray(file);
862 suffixDocId = new BlockArray(file);
864 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
865 throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
866 if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
867 throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
868 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
869 throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
871 endmarkerDocId = new BlockArray(file);
872 emptyTextRank = new BSGAP(file);
874 // FIXME Construct data structures with new samplerate
881 * Rest of the functions follow...
885 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
887 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
888 int c = (int)pattern[m-1];
890 TextPosition sp = C[c];
891 TextPosition ep = C[c+1]-1;
892 while (sp<=ep && i>=1)
894 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
895 c = (int)pattern[--i];
896 sp = C[c]+alphabetrank->rank(c,sp-1);
897 ep = C[c]+alphabetrank->rank(c,ep)-1;
915 delete [] codetable; // FIXME remove
917 delete endmarkerDocId;
918 delete emptyTextRank;
921 void CSA::makewavelet(uchar *bwt)
930 if (C[i]>0) {min = i; break;}
931 for (i=255;i>=min;--i)
932 if (C[i]>0) {max = i; break;}
934 ulong prev=C[0], temp;
936 for (i=1;i<256;i++) {
941 // this->codetable = node::makecodetable(bwt,n);
942 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
944 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
946 alphabet_mapper * am = new alphabet_mapper_none();
947 static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
948 wt_coder * wtc = new wt_coder_huff(bwt,n,am);
949 alphabetrank = new static_sequence_wvtree(bwt,n,wtc,bmb,am);
954 void CSA::maketables()
956 // Calculate BWT end-marker position (of last inserted text)
957 // Note: bwtEndPos already set!
958 // and the length of the first text (counter l):
963 uchar c = alphabetrank->access(i);
966 i = C[c]+alphabetrank->rank(c, i)-1;
968 c = alphabetrank->access(i);
970 assert(i == bwtEndPos); // compare result
973 // Build up arrays for text length and starting positions
974 // FIXME Temp, remove
975 //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
976 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
977 //(*textLength)[0] = l;
978 (*textStartPos)[0] = 0;
981 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
982 unsigned ceilLog2n = Tools::CeilLog2(n);
983 BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
984 BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
986 // Mapping from end-markers to doc ID's:
987 endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
989 ulong *sampledpositions = new ulong[n/W+1];
990 for (ulong i=0;i<n/W+1;i++)
991 sampledpositions[i]=0lu;
994 ulong sampleCount = 0;
995 // Keeping track of text position of prev. end-marker seen
996 ulong posOfSuccEndmarker = n;
997 DocId textId = numberOfTexts;
1002 for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1003 // i substitutes SA->GetPos(i)
1006 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1007 Tools::SetField(sampledpositions,1,p,1);
1008 (*positions)[sampleCount] = p;
1009 (*tmpSuffix)[sampleCount] = x; // FIXME remove
1013 uchar c = alphabetrank->access(p);
1018 // Record the order of end-markers in BWT:
1019 ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
1020 (*endmarkerDocId)[endmarkerRank] = textId;
1022 // Store text length and text start position:
1023 if (textId < (DocId)numberOfTexts - 1)
1025 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1026 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1027 posOfSuccEndmarker = x;
1030 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1031 p = textId; // Correct LF-mapping to the last char of the previous text.
1034 p = C[c]+alphabetrank->rank(c, p)-1;
1036 assert(textId == 0);
1038 sampled = new BSGAP(sampledpositions,n,true);
1039 sampleLength = sampled->rank(n-1);
1040 assert(sampleCount == sampleLength);
1042 // Suffixes store an offset from the text start position
1043 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1044 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1046 for(ulong i=0; i<sampleLength; i++) {
1047 assert((*positions)[i] < n);
1048 ulong j = sampled->rank((*positions)[i]);
1049 if (j==0) j=sampleLength;
1050 TextPosition textPos = (*tmpSuffix)[i];
1051 (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1053 assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1054 assert((*suffixDocId)[j-1] < numberOfTexts);
1055 // calculate offset from text start:
1056 (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1058 // FIXME Temp, remove
1061 // delete textLength;
1062 delete textStartPos;
1067 * Finds document identifier for given text position
1069 * Starting text position of the document is stored into second parameter.
1070 * Binary searching on text starting positions.
1072 TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1077 DocId b = numberOfTexts - 1;
1080 DocId c = a + (b - a)/2;
1081 if ((*textStartPos)[c] > i)
1083 else if ((*textStartPos)[c+1] > i)
1089 assert(a < (DocId)numberOfTexts);
1090 assert(i >= (*textStartPos)[a]);
1091 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1095 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1097 TCodeEntry *result = new TCodeEntry[ 256 ];
1099 count_chars( text, n, result );
1100 std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1102 // First I push all the leaf nodes into the queue
1104 for ( unsigned int i = 0 ; i < 256 ; i++ )
1105 if ( result[ i ].count )
1106 q.push(node( i, result[ i ].count ) );
1108 // This loop removes the two smallest nodes from the
1109 // queue. It creates a new internal node that has
1110 // those two nodes as children. The new internal node
1111 // is then inserted into the priority queue. When there
1112 // is only one node in the priority queue, the tree
1116 while ( q.size() > 1 ) {
1117 node *child0 = new node( q.top() );
1119 node *child1 = new node( q.top() );
1121 q.push( node( child0, child1 ) );
1124 // Now I compute and return the codetable
1126 q.top().maketable(0u,0u, result);
1132 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1136 child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1137 child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1143 codetable[value].code = code;
1144 codetable[value].bits = bits;
1148 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1151 for (i = 0 ; i < 256 ; i++ )
1152 counts[ i ].count = 0;
1154 counts[(int)text[i]].count++;
1157 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1158 return x | (bit << pos);