1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #include "TextCollection.h"
30 #include <cstring> // For strlen()
36 using SXSI::TextCollection;
39 ////////////////////////////////////////////////////////////////////////////
40 // Class CSA::THuffAlphabetRank
42 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
48 this->codetable = codetable;
50 bool *B = new bool[n];
53 for (i=0; i< n; i++) {
54 printf("%c:", (char)((int)s[i]-128));
55 for (r=0;r<codetable[(int)s[i]].bits;r++)
56 if (codetable[(int)s[i]].code & (1u <<r))
62 if (level > 100) return;*/
64 if (codetable[(int)s[i]].code & (1u << level)) {
69 if (sum==0 || sum==n) {
74 uchar *sfirst, *ssecond;
76 sfirst = new uchar[n-sum];
78 ssecond = new uchar[sum];
81 if (B[i]) ssecond[k++] = s[i];
82 else sfirst[j++] = s[i];
83 ulong *Binbits = new ulong[n/W+1];
85 Tools::SetField(Binbits,1,i,B[i]);
87 bitrank = new BitRank(Binbits,n,true);
89 left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
93 right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
99 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
100 // testing that the code works correctly
108 if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
110 printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
116 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
117 if (left!=NULL) delete left;
118 if (right!=NULL) delete right;
124 /////////////////////////////////////////////f///////////////////////////////
128 * Constructor inits an empty dynamic FM-index.
129 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
131 CSA::CSA(unsigned samplerate)
132 : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
133 codetable(0), dynFMI(0), numberOfTexts(0), maxTextLength(0), endmarkerDocId(0),
134 textLength(0), textStartPos(0)
136 this->samplerate = samplerate;
138 // FIXME TODO : DynFMI needs distribution of characters before hand
139 // This will create fully balanced wavelet tree for all chars [0, 255].
141 for (unsigned i = 0; i < 255; ++i)
144 dynFMI = new DynFMI(temp, 1, 255, false);
146 /* Debug code: take char distribution from data.txt.
147 uchar *temp = Tools::GetFileContents("data.txt", 0);
148 dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
155 * Given text must include end-marker.
156 * Text identifiers are numbered starting from 0.
158 void CSA::InsertText(uchar const * text)
163 TextPosition m = std::strlen((char *)text) + 1;
164 if (m > maxTextLength)
165 maxTextLength = m; // Store length of the longest text seen so far.
168 this->numberOfTexts ++;
169 dynFMI->addText(text, m);
172 void CSA::MakeStatic()
177 std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
181 uchar *bwt = dynFMI->getBWT();
182 /*printf("123456789012345678901234567890123456789\n");
183 for (TextPosition i = 0; i < n; i ++)
185 printf("%d", (int)bwt[i]);
187 printf("%c", bwt[i]);
191 /* for (TextPosition i = 1; i <= n; i ++)
193 printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
197 assert(numberOfTexts == dynFMI->getCollectionSize());
204 // Calculate BWT end-marker position (of last inserted text)
205 // and the length of the first text (counter l):
208 while (bwt[i] != '\0')
211 i = C[c]+alphabetrank->rank(c, i)-1;
215 //printf("bwtEndPos = %lu\n", bwtEndPos);
219 // Build up arrays for text length and starting positions
220 textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
221 textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
222 (*textLength)[0] = l;
223 (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables().
225 // Make sampling tables
229 bool CSA::EmptyText(DocId k) const
231 assert(k < (DocId)numberOfTexts);
232 return (1 == (*textLength)[k]);
235 uchar* CSA::GetText(DocId k) const
237 assert(k < (DocId)numberOfTexts);
239 uchar* result = new uchar[(*textLength)[k]];
241 TextPosition j = (*textLength)[k] - 1;
244 uchar c = alphabetrank->charAtPos(i);
249 i = C[c]+alphabetrank->rank(c,i)-1;
251 c = alphabetrank->charAtPos(i); // "next" char.
257 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
259 assert(k < (DocId)numberOfTexts);
260 assert(j < (*textLength)[k]);
263 // Start position of k'th text
264 ulong start = (*textStartPos)[k];
266 return Substring(i + start, j-i+1);
269 /******************************************************************
270 * Existential queries
272 bool CSA::IsPrefix(uchar const * pattern) const
274 TextPosition m = strlen((char *)pattern);
278 TextPosition sp = 0, ep = 0;
279 Search(pattern, m, &sp, &ep);
281 // Check for end-marker(s) in result interval
282 if (CountEndmarkers(sp, ep))
287 bool CSA::IsSuffix(uchar const *pattern) const
289 // Here counting is as fast as isSuffix():
290 if (CountSuffix(pattern) > 0)
295 bool CSA::IsEqual(uchar const *pattern) const
297 TextPosition m = std::strlen((char *)pattern);
299 // return false; // FIXME Check for empty texts!
301 TextPosition sp = 0, ep = 0;
302 // Match including end-marker
303 Search(pattern, m+1, &sp, &ep);
305 // Check for end-marker(s) in result interval
306 if (CountEndmarkers(sp, ep))
311 bool CSA::IsContains(uchar const * pattern) const
313 TextPosition m = strlen((char *)pattern);
317 TextPosition sp = 0, ep = 0;
318 // Just check if pattern exists somewhere
319 ulong count = Search(pattern, m, &sp, &ep);
326 bool CSA::IsLessThan(uchar const*) const
333 /******************************************************************
336 unsigned CSA::CountPrefix(uchar const * pattern) const
338 TextPosition m = strlen((char *)pattern);
340 return numberOfTexts;
342 TextPosition sp = 0, ep = 0;
343 Search(pattern, m, &sp, &ep);
345 // Count end-markers in result interval
346 return CountEndmarkers(sp, ep);
349 unsigned CSA::CountSuffix(uchar const * pattern) const
351 TextPosition m = strlen((char *)pattern);
353 return numberOfTexts;
355 TextPosition sp = 0, ep = 0;
356 // Search with end-marker
357 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
362 unsigned CSA::CountEqual(uchar const *pattern) const
364 TextPosition m = strlen((char const *)pattern);
366 // return ; // FIXME Test for empty texts.
368 TextPosition sp = 0, ep = 0;
369 // Match including end-marker
370 Search(pattern, m+1, &sp, &ep);
372 // Count end-markers in result interval
373 return CountEndmarkers(sp, ep);
376 unsigned CSA::CountContains(uchar const * pattern) const
378 // Here counting is as slow as fetching the result set
379 // because we would have to filter out occ's that fall within same document.
380 TextCollection::document_result result = Contains(pattern);
381 return result.size();
384 unsigned CSA::CountLessThan(uchar const * pattern) const
392 * Document reporting queries
394 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
396 TextPosition m = strlen((char *)pattern);
398 return TextCollection::document_result();
400 TextPosition sp = 0, ep = 0;
401 Search(pattern, m, &sp, &ep);
403 TextCollection::document_result result;
405 // Report end-markers in result interval
406 unsigned resultSize = CountEndmarkers(sp, ep);
412 i = alphabetrank->rank(0, sp - 1);
415 // End-marker we found belongs to the "preceeding" doc in the collection
416 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
417 result.push_back(docId);
425 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
427 TextPosition m = strlen((char *)pattern);
429 return TextCollection::document_result();
431 TextPosition sp = 0, ep = 0;
432 // Search with end-marker
433 Search(pattern, m+1, &sp, &ep);
435 TextCollection::document_result result;
437 // Check each occurrence
438 for (; sp <= ep; ++sp)
441 uchar c = alphabetrank->charAtPos(i);
442 while (c != '\0' && !sampled->IsBitSet(i))
444 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
445 c = alphabetrank->charAtPos(i);
449 // Rank among the end-markers in BWT
450 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
452 // End-marker that we found belongs to the "preceeding" doc in collection:
453 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
454 result.push_back(docId);
456 else // Sampled position
458 TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
459 result.push_back(DocIdAtTextPos(textPos));
466 TextCollection::document_result CSA::Equal(uchar const *pattern) const
468 TextPosition m = strlen((char const *)pattern);
470 return TextCollection::document_result();
472 TextPosition sp = 0, ep = 0;
473 // Match including end-marker
474 Search(pattern, m+1, &sp, &ep);
476 TextCollection::document_result result;
478 // Report end-markers in result interval
479 unsigned resultSize = CountEndmarkers(sp, ep);
485 i = alphabetrank->rank(0, sp - 1);
488 // End-marker we found belongs to the "previous" doc in the collection
489 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
490 result.push_back(docId);
499 TextCollection::document_result CSA::Contains(uchar const * pattern) const
501 TextPosition m = strlen((char *)pattern);
503 return TextCollection::document_result();
505 TextPosition sp = 0, ep = 0;
506 // Search all occurrences
507 Search(pattern, m, &sp, &ep);
509 // We want unique document indentifiers, using std::set to collect them
510 std::set<DocId> resultSet;
512 // Check each occurrence
513 for (; sp <= ep; ++sp)
516 uchar c = alphabetrank->charAtPos(i);
517 while (c != '\0' && !sampled->IsBitSet(i))
519 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
520 c = alphabetrank->charAtPos(i);
524 // Rank among the end-markers in BWT
525 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
527 // End-marker that we found belongs to the "preceeding" doc in collection:
528 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
529 resultSet.insert(docId);
533 TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
534 resultSet.insert(DocIdAtTextPos(textPos));
538 // Convert std::set to std::vector (TODO better way to construct result vector?)
539 TextCollection::document_result result;
540 for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
541 result.push_back(*it);
545 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
549 return document_result();
553 * Full result set queries
555 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
557 TextPosition m = strlen((char *)pattern);
559 return full_result();
561 TextPosition sp = 0, ep = 0;
562 // Search all occurrences
563 Search(pattern, m, &sp, &ep);
567 // Report each occurrence
568 for (; sp <= ep; ++sp)
571 TextPosition dist = 0;
572 uchar c = alphabetrank->charAtPos(i);
573 while (c != '\0' && !sampled->IsBitSet(i))
575 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
576 c = alphabetrank->charAtPos(i);
581 // Rank among the end-markers in BWT
582 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
584 // End-marker that we found belongs to the "preceeding" doc in collection:
585 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
586 result.push_back(make_pair(docId, dist));
590 TextPosition textPos = (*suffixes)[sampled->rank(i)-1] + dist;
591 // Read document identifier and its starting position in text order
592 DocId docId = DocIdAtTextPos(textPos);
593 result.push_back(make_pair(docId, textPos - (*textStartPos)[docId]));
602 * Finds document identifier for given text position
604 * Starting text position of the document is stored into second parameter.
605 * Binary searching on text starting positions.
607 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
612 DocId b = numberOfTexts - 1;
615 DocId c = a + (b - a)/2;
616 if ((*textStartPos)[c] > i)
618 else if ((*textStartPos)[c+1] > i)
624 assert(a < (DocId)numberOfTexts);
625 assert(i > (*textStartPos)[a]);
626 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
631 * Count end-markers in given interval
633 unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep) const
640 ranksp = alphabetrank->rank(0, sp - 1);
642 return alphabetrank->rank(0, ep) - ranksp;
646 * Save index to a file handle
648 * Throws a std::runtime_error exception on i/o error.
649 * First byte that is saved represents the version number of the save file.
650 * In version 1 files, the data fields are:
651 * <uchar version> version info;
652 * <unsigned s> samplerate;
653 * <ulong n> length of the BWT;
654 * <ulong bwt> end marker position in BWT;
655 * <uchar *> BWT string of length n;
656 * <unsigned r> number of texts;
657 * <ulong max> length of longest text;
658 * <vector textLength>
659 * array of <ulong, ulong> pairs.
661 * TODO: Save the data structures instead of BWT sequence?
662 * TODO: Don't save textLength and textStartPos arrays.
664 void CSA::Save(FILE *file) const
666 // Saving version 1 data:
667 uchar versionFlag = 1;
668 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
669 throw std::runtime_error("CSA::Save(): file write error (version flag).");
671 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
672 throw std::runtime_error("CSA::Save(): file write error (samplerate).");
674 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
675 throw std::runtime_error("CSA::Save(): file write error (n).");
677 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
678 throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
680 for (ulong offset = 0; offset < n; offset ++)
682 uchar c = alphabetrank->charAtPos(offset);
683 if (std::fwrite(&c, sizeof(uchar), 1, file) != 1)
684 throw std::runtime_error("CSA::Save(): file write error (bwt sequence).");
687 unsigned r = numberOfTexts;
688 if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1)
689 throw std::runtime_error("CSA::Save(): file write error (r).");
691 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
692 throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
694 for (r = 0; r < numberOfTexts; ++ r)
696 if (std::fwrite(&((*textLength)[r]), sizeof(TextPosition), 1, file) != 1)
697 throw std::runtime_error("CSA::Save(): file write error (text length).");
698 if (std::fwrite(&((*textStartPos)[r]), sizeof(TextPosition), 1, file) != 1)
699 throw std::runtime_error("CSA::Save(): file write error (text start).");
705 * Load index from a file handle
707 * Throws a std::runtime_error exception on i/o error.
708 * For more info, see CSA::Save().
710 void CSA::Load(FILE *file, unsigned samplerate)
713 delete dynFMI; dynFMI = 0;
714 delete alphabetrank; alphabetrank = 0;
715 delete sampled; sampled = 0;
716 delete [] suffixes; suffixes = 0;
717 delete [] positions; positions = 0;
718 delete [] codetable; codetable = 0;
720 delete endmarkerDocId; endmarkerDocId = 0;
721 delete textLength; textLength = 0;
722 delete textStartPos; textStartPos = 0;
724 this->maxTextLength = 0;
725 this->numberOfTexts = 0;
726 this->samplerate = samplerate;
729 uchar versionFlag = 0;
730 if (std::fread(&versionFlag, 1, 1, file) != 1)
731 throw std::runtime_error("CSA::Load(): file read error (version flag).");
732 if (versionFlag != 1)
733 throw std::runtime_error("CSA::Load(): invalid start byte.");
735 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
736 throw std::runtime_error("CSA::Load(): file read error (samplerate).");
737 if (this->samplerate == 0)
738 this->samplerate = samplerate;
740 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
741 throw std::runtime_error("CSA::Load(): file read error (n).");
743 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
744 throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
746 uchar *bwt = new uchar[n];
747 for (ulong offset = 0; offset < n; offset ++)
748 if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1)
749 throw std::runtime_error("CSA::Load(): file read error (bwt sequence).");
752 if (std::fread(&r, sizeof(unsigned), 1, file) != 1)
753 throw std::runtime_error("CSA::Load(): file read error (r).");
755 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
756 throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
758 // Build up arrays for text length and starting positions
759 textLength = new BlockArray(r, Tools::CeilLog2(maxTextLength));
760 textStartPos = new BlockArray(r, Tools::CeilLog2(this->n));
764 TextPosition length = 0, start = 0;
765 if (std::fread(&length, sizeof(TextPosition), 1, file) != 1)
766 throw std::runtime_error("CSA::Load(): file read error (text length).");
767 if (std::fread(&start, sizeof(TextPosition), 1, file) != 1)
768 throw std::runtime_error("CSA::Load(): file read error (text start).");
770 (*textLength)[numberOfTexts] = length;
771 (*textStartPos)[numberOfTexts] = start;
776 // Construct data structures
779 maketables(); // FIXME: this will redo text length tables
785 * Rest of the functions follow...
788 uchar * CSA::Substring(TextPosition i, TextPosition l) const
790 uchar *result = new uchar[l + 1];
798 TextPosition k = i + l - 1;
799 // Check for end of the string
806 TextPosition skip = samplerate - k % samplerate - 1;
808 if (k / samplerate + 1 >= n / samplerate)
815 j = (*positions)[k/samplerate+1];
816 //cout << samplerate << ' ' << j << '\n';
819 for (dist = 0; dist < skip + l; dist++)
821 int c = alphabetrank->charAtPos(j);
824 // Rank among the end-markers in BWT
825 unsigned endmarkerRank = alphabetrank->rank(0, j) - 1;
826 j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker
829 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
831 result[l + skip - dist - 1] = c;
837 /*TextPosition CSA::inverse(TextPosition i)
839 TextPosition skip = samplerate - i % samplerate;
841 if (i / samplerate + 1 >= n / samplerate)
848 j = (*positions)[i/samplerate+1];
849 //cout << samplerate << ' ' << j << '\n';
854 int c = alphabetrank->charAtPos(j);
855 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
861 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
862 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
863 int c = (int)pattern[m-1];
865 TextPosition sp = C[c];
866 TextPosition ep = C[c+1]-1;
867 while (sp<=ep && i>=1)
869 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
870 c = (int)pattern[--i];
871 sp = C[c]+alphabetrank->rank(c,sp-1);
872 ep = C[c]+alphabetrank->rank(c,ep)-1;
882 /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
884 sp = C[(int)c]+alphabetrank->rank(c,sp-1);
885 ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
893 CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
896 while (!sampled->IsBitSet(i))
898 int c = alphabetrank->charAtPos(i);
901 // End-markers are sampled
902 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
903 DocId docId = (*endmarkerDocId)[endmarkerRank];
904 return (*textStartPos)[docId] + dist;
906 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
910 return (*suffixes)[sampled->rank(i)-1]+dist;
921 delete endmarkerDocId;
926 void CSA::makewavelet(uchar *bwt)
935 if (C[i]>0) {min = i; break;}
936 for (i=255;i>=min;--i)
937 if (C[i]>0) {max = i; break;}
940 /*for(i = 0; i < 256; i++)
941 if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
944 ulong prev=C[0], temp;
946 for (i=1;i<256;i++) {
951 this->codetable = node::makecodetable(bwt,n);
952 alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
953 //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
955 /* for (i = 0; i < n; ++i)
957 uchar c = alphabetrank->charAtPos(i);
958 TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
959 printf("LF[%lu] = %lu\n", i, j);
963 void CSA::maketables()
965 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
967 ulong *sampledpositions = new ulong[n/W+1];
968 unsigned ceilLog2n = Tools::CeilLog2(n);
969 suffixes = new BlockArray(sampleLength, ceilLog2n);
970 positions = new BlockArray(sampleLength, ceilLog2n);
972 // Mapping from end-markers to doc ID's:
973 endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
976 for (i=0;i<n/W+1;i++)
977 sampledpositions[i]=0lu;
980 // Keeping track of text position of end-markers seen
981 ulong posOfSuccEndmarker = n;
982 DocId textId = numberOfTexts;
987 for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
988 // i substitutes SA->GetPos(i)
991 if (x % samplerate == 0) {
992 Tools::SetField(sampledpositions,1,p,1);
993 (*positions)[x/samplerate] = p;
996 uchar c = alphabetrank->charAtPos(p);
1001 // Record the order of end-markers in BWT:
1002 ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
1003 (*endmarkerDocId)[endmarkerRank] = textId;
1005 // Store text length and text start position:
1006 if (textId < (DocId)numberOfTexts - 1)
1008 (*textLength)[textId + 1] = posOfSuccEndmarker - x;
1009 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1010 posOfSuccEndmarker = x;
1013 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1014 p = textId; // Correct LF-mapping to the last char of the previous text.
1017 p = C[c]+alphabetrank->rank(c, p)-1;
1019 assert(textId == 0);
1022 for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i)
1024 int docc = (*endmarkerDocId)[i];
1025 ulong poss = (*endmarkerPos)[i];
1026 printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss);
1029 for (i = 0; i < numberOfTexts; ++ i)
1031 //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl;
1032 assert(dynTextLength[i].first == (*textLength)[i]);
1033 assert(dynTextLength[i].second == (*textStartPos)[i]);
1036 sampled = new BitRank(sampledpositions,n,true);
1039 for(i=0; i<sampleLength; i++) {
1040 j = sampled->rank((*positions)[i]);
1041 if (j==0) j=sampleLength;
1042 (*suffixes)[j-1] = (i*samplerate==n)?0:i*samplerate;
1046 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1048 TCodeEntry *result = new TCodeEntry[ 256 ];
1050 count_chars( text, n, result );
1051 std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1053 // First I push all the leaf nodes into the queue
1055 for ( unsigned int i = 0 ; i < 256 ; i++ )
1056 if ( result[ i ].count )
1057 q.push(node( i, result[ i ].count ) );
1059 // This loop removes the two smallest nodes from the
1060 // queue. It creates a new internal node that has
1061 // those two nodes as children. The new internal node
1062 // is then inserted into the priority queue. When there
1063 // is only one node in the priority queue, the tree
1067 while ( q.size() > 1 ) {
1068 node *child0 = new node( q.top() );
1070 node *child1 = new node( q.top() );
1072 q.push( node( child0, child1 ) );
1075 // Now I compute and return the codetable
1077 q.top().maketable(0u,0u, result);
1083 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1087 child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1088 child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1094 codetable[value].code = code;
1095 codetable[value].bits = bits;
1099 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1102 for (i = 0 ; i < 256 ; i++ )
1103 counts[ i ].count = 0;
1105 counts[(int)text[i]].count++;
1108 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1109 return x | (bit << pos);