1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #include "TextCollection.h"
30 #include <cstring> // For strlen()
36 using SXSI::TextCollection;
38 // Save file version info
39 const uchar CSA::versionFlag = 2;
41 ////////////////////////////////////////////////////////////////////////////
42 // Class CSA::THuffAlphabetRank
44 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
50 this->codetable = codetable;
52 bool *B = new bool[n];
55 for (i=0; i< n; i++) {
56 printf("%c:", (char)((int)s[i]-128));
57 for (r=0;r<codetable[(int)s[i]].bits;r++)
58 if (codetable[(int)s[i]].code & (1u <<r))
64 if (level > 100) return;*/
66 if (codetable[(int)s[i]].code & (1u << level)) {
71 if (sum==0 || sum==n) {
76 uchar *sfirst, *ssecond;
78 sfirst = new uchar[n-sum];
80 ssecond = new uchar[sum];
83 if (B[i]) ssecond[k++] = s[i];
84 else sfirst[j++] = s[i];
85 ulong *Binbits = new ulong[n/W+1];
87 Tools::SetField(Binbits,1,i,B[i]);
89 bitrank = new BitRank(Binbits,n,true);
91 left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
95 right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
101 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
102 // testing that the code works correctly
110 if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
112 printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
118 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
119 if (left!=NULL) delete left;
120 if (right!=NULL) delete right;
126 * Saving data fields:
133 void CSA::THuffAlphabetRank::Save(FILE *file)
138 CSA::THuffAlphabetRank::THuffAlphabetRank(FILE *file)
144 /////////////////////////////////////////////f///////////////////////////////
148 * Constructor inits an empty dynamic FM-index.
149 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
151 CSA::CSA(unsigned samplerate)
152 : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0),
153 codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
154 textLength(0), textStartPos(0), emptyTextRank(0)
156 this->samplerate = samplerate;
158 // FIXME TODO : DynFMI needs distribution of characters before hand
159 // This will create fully balanced wavelet tree for all chars [0, 255].
161 for (unsigned i = 0; i < 255; ++i)
164 dynFMI = new DynFMI(temp, 1, 255, false);
166 /* Debug code: take char distribution from data.txt.
167 uchar *temp = Tools::GetFileContents("data.txt", 0);
168 dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
175 * Given text must include end-marker.
176 * Text identifiers are numbered starting from 0.
178 void CSA::InsertText(uchar const * text)
183 TextPosition m = std::strlen((char *)text) + 1;
184 if (m > maxTextLength)
185 maxTextLength = m; // Store length of the longest text seen so far.
190 this->numberOfTexts ++;
191 this->numberOfAllTexts ++;
192 dynFMI->addText(text, m);
196 emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
197 this->numberOfAllTexts ++; // Replace with dynamic bitvector
201 void CSA::MakeStatic()
205 throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
207 // Bitvector of empty texts
209 //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
210 ulong *tempB = new ulong[numberOfAllTexts/W + 1];
211 for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
213 for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
214 Tools::SetField(tempB, 1, (*it), 1);
216 emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
219 uchar *bwt = dynFMI->getBWT();
220 /*printf("123456789012345678901234567890123456789\n");
221 for (TextPosition i = 0; i < n; i ++)
223 printf("%d", (int)bwt[i]);
225 printf("%c", bwt[i]);
229 /* for (TextPosition i = 1; i <= n; i ++)
231 printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
235 assert(numberOfTexts == dynFMI->getCollectionSize());
240 makewavelet(bwt); // Deletes bwt!
243 // Calculate BWT end-marker position (of last inserted text)
244 // and the length of the first text (counter l):
248 uchar c = alphabetrank->access(i);
251 i = C[c]+alphabetrank->rank(c, i)-1;
253 c = alphabetrank->access(i);
256 assert(bwtEndPos < n);
257 //printf("bwtEndPos = %lu\n", bwtEndPos);
259 // Build up arrays for text length and starting positions
260 // FIXME Remove, temp data
261 textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
262 textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
263 (*textLength)[0] = l;
264 (*textStartPos)[0] = 0; // Rest of the values are updated in CSA::maketables().
266 // Make sampling tables
270 bool CSA::EmptyText(DocId k) const
272 assert(k < (DocId)numberOfTexts);
273 if (emptyTextRank->IsBitSet(k))
278 uchar* CSA::GetText(DocId k) const
280 assert(k < (DocId)numberOfTexts);
283 if (emptyTextRank->IsBitSet(k, &textRank))
285 uchar* result = new uchar[1];
289 // Map to non-empty text
290 k -= textRank; //emptyTextRank->rank(k);
295 // Reserve average string length to avoid reallocs
296 result.reserve(n/numberOfTexts);
298 uchar c = alphabetrank->access(i);
302 i = C[c]+alphabetrank->rank(c,i)-1;
304 c = alphabetrank->access(i); // "next" char.
307 // Convert to uchar (FIXME return string?)
309 uchar* res = new uchar[i+1];
311 for (ulong j = 0; j < i; ++j)
312 res[i-j-1] = result[j];
317 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
319 assert(k < (DocId)numberOfTexts);
320 assert(j < (*textLength)[k]);
324 if (emptyTextRank->IsBitSet(k, &textRank))
326 uchar* result = new uchar[1];
331 // Map to non-empty text
332 k -= textRank; // emptyTextRank->rank(k);
334 // Start position of k'th text
335 ulong start = (*textStartPos)[k];
337 return Substring(i + start, j-i+1);
340 /******************************************************************
341 * Existential queries
343 bool CSA::IsPrefix(uchar const * pattern) const
345 TextPosition m = strlen((char *)pattern);
349 TextPosition sp = 0, ep = 0;
350 Search(pattern, m, &sp, &ep);
352 // Check for end-marker(s) in result interval
353 if (CountEndmarkers(sp, ep))
358 bool CSA::IsSuffix(uchar const *pattern) const
360 // Here counting is as fast as IsSuffix():
361 if (CountSuffix(pattern) > 0)
366 bool CSA::IsEqual(uchar const *pattern) const
368 TextPosition m = std::strlen((char *)pattern);
371 if (numberOfAllTexts - numberOfTexts > 0u)
372 return true; // Empty texts exists
373 return false; // No empty texts exists
376 TextPosition sp = 0, ep = 0;
377 // Match including end-marker
378 Search(pattern, m+1, &sp, &ep);
380 // Check for end-marker(s) in result interval
381 if (CountEndmarkers(sp, ep))
386 bool CSA::IsContains(uchar const * pattern) const
388 TextPosition m = strlen((char *)pattern);
392 TextPosition sp = 0, ep = 0;
393 // Just check if pattern exists somewhere
394 ulong count = Search(pattern, m, &sp, &ep);
401 bool CSA::IsLessThan(uchar const*) const
408 /******************************************************************
411 unsigned CSA::CountPrefix(uchar const * pattern) const
413 TextPosition m = strlen((char *)pattern);
415 return numberOfAllTexts;
417 TextPosition sp = 0, ep = 0;
418 Search(pattern, m, &sp, &ep);
420 // Count end-markers in result interval
421 return CountEndmarkers(sp, ep);
424 unsigned CSA::CountSuffix(uchar const * pattern) const
426 TextPosition m = strlen((char *)pattern);
428 return numberOfAllTexts;
430 TextPosition sp = 0, ep = 0;
431 // Search with end-marker
432 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
437 unsigned CSA::CountEqual(uchar const *pattern) const
439 TextPosition m = strlen((char const *)pattern);
441 return numberOfAllTexts - numberOfTexts; // Empty texts.
443 TextPosition sp = 0, ep = 0;
444 // Match including end-marker
445 Search(pattern, m+1, &sp, &ep);
447 // Count end-markers in result interval
448 return CountEndmarkers(sp, ep);
451 unsigned CSA::CountContains(uchar const * pattern) const
453 TextPosition m = strlen((char const *)pattern);
455 return numberOfAllTexts; // Total number of texts.
457 // Here counting is as slow as fetching the result set
458 // because we would have to filter out occ's that fall within same document.
459 TextCollection::document_result result = Contains(pattern);
460 return result.size();
463 unsigned CSA::CountLessThan(uchar const * pattern) const
471 * Document reporting queries
473 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
475 TextPosition m = strlen((char *)pattern);
477 return TextCollection::document_result(); // FIXME Should return all 1...k
479 TextPosition sp = 0, ep = 0;
480 Search(pattern, m, &sp, &ep);
482 TextCollection::document_result result;
484 // Report end-markers in result interval
485 unsigned resultSize = CountEndmarkers(sp, ep);
489 result.reserve(resultSize); // Try to avoid reallocation.
491 // Iterate through end-markers in [sp,ep]:
494 i = alphabetrank->rank(0, sp - 1);
497 // End-marker we found belongs to the "preceeding" doc in the collection
498 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
500 docId = emptyTextRank->select0(docId+1);
501 result.push_back(docId);
510 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
512 TextPosition m = strlen((char *)pattern);
514 return TextCollection::document_result(); // FIXME Should return all 1...k
516 TextPosition sp = 0, ep = 0;
517 // Search with end-marker
518 Search(pattern, m+1, &sp, &ep);
520 TextCollection::document_result result;
521 result.reserve(ep-sp+1); // Try to avoid reallocation.
523 ulong sampled_rank_i = 0;
524 // Check each occurrence
525 for (; sp <= ep; ++sp)
529 uchar c = alphabetrank->access(i);
530 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
532 i = C[c]+alphabetrank->rank(c,i)-1;
533 c = alphabetrank->access(i);
535 // Assert: c == '\0' OR sampled->IsBitSet(i)
539 // Rank among the end-markers in BWT
540 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
542 // End-marker that we found belongs to the "preceeding" doc in collection:
543 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
545 docId = emptyTextRank->select0(docId+1);
546 result.push_back(docId);
548 else // Sampled position
550 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
552 docId = emptyTextRank->select0(docId+1);
553 result.push_back(docId);
560 TextCollection::document_result CSA::Equal(uchar const *pattern) const
562 TextPosition m = strlen((char const *)pattern);
564 return TextCollection::document_result(); // FIXME Should return all empty texts
566 TextPosition sp = 0, ep = 0;
567 // Match including end-marker
568 Search(pattern, m+1, &sp, &ep);
570 TextCollection::document_result result;
572 // Report end-markers in result interval
573 unsigned resultSize = CountEndmarkers(sp, ep);
577 result.reserve(resultSize); // Try to avoid reallocation.
581 i = alphabetrank->rank(0, sp - 1);
584 // End-marker we found belongs to the "previous" doc in the collection
585 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
587 docId = emptyTextRank->select0(docId+1);
588 result.push_back(docId);
598 TextCollection::document_result CSA::Contains(uchar const * pattern) const
600 TextPosition m = strlen((char *)pattern);
602 return TextCollection::document_result();
604 TextPosition sp = 0, ep = 0;
605 // Search all occurrences
606 Search(pattern, m, &sp, &ep);
608 // We want unique document indentifiers, using std::set to collect them
609 std::set<DocId> resultSet;
611 ulong sampled_rank_i = 0;
612 // Check each occurrence
613 for (; sp <= ep; ++sp)
616 uchar c = alphabetrank->access(i);
617 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
619 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
620 c = alphabetrank->access(i);
624 // Rank among the end-markers in BWT
625 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
627 // End-marker that we found belongs to the "preceeding" doc in collection:
628 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
629 resultSet.insert(docId);
633 DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
634 assert((unsigned)di < numberOfTexts);
635 resultSet.insert(di);
639 // Convert std::set to std::vector
640 TextCollection::document_result result(resultSet.begin(), resultSet.end());
642 for (document_result::iterator it = result.begin(); it != result.end(); ++it)
643 *it = emptyTextRank->select0(*it+1);
647 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
651 return document_result();
655 * Full result set queries
657 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
659 TextPosition m = strlen((char *)pattern);
661 return full_result(); // FIXME Throw exception?
663 TextPosition sp = 0, ep = 0;
664 // Search all occurrences
665 Search(pattern, m, &sp, &ep);
668 result.reserve(ep-sp+1); // Try to avoid reallocation.
670 ulong sampled_rank_i = 0;
671 // Report each occurrence
672 for (; sp <= ep; ++sp)
675 TextPosition dist = 0;
676 uchar c = alphabetrank->access(i);
677 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
679 i = C[c]+alphabetrank->rank(c,i)-1;
680 c = alphabetrank->access(i);
685 // Rank among the end-markers in BWT
686 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
688 // End-marker that we found belongs to the "preceeding" doc in collection:
689 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
691 docId = emptyTextRank->select0(docId+1);
692 result.push_back(make_pair(docId, dist));
696 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
697 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
698 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
701 docId = emptyTextRank->select0(docId+1);
702 result.push_back(make_pair(docId, textPos));
711 * Save index to a file handle
713 * Throws a std::runtime_error exception on i/o error.
714 * First byte that is saved represents the version number of the save file.
715 * In version 2 files, the data fields are:
720 TextPosition bwtEndPos;
721 static_sequence * alphabetrank;
723 BlockArray *suffixes;
724 BlockArray *suffixDocId;
725 unsigned numberOfTexts;
726 unsigned numberOfAllTexts;
728 BlockArray *endmarkerDocId;
729 BSGAP *emptyTextRank;
731 void CSA::Save(FILE *file) const
733 // Saving version info:
734 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
735 throw std::runtime_error("CSA::Save(): file write error (version flag).");
737 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
738 throw std::runtime_error("CSA::Save(): file write error (n).");
739 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
740 throw std::runtime_error("CSA::Save(): file write error (samplerate).");
742 for(ulong i = 0; i < 256; ++i)
743 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
744 throw std::runtime_error("CSA::Save(): file write error (C table).");
746 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
747 throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
749 alphabetrank->save(file);
751 suffixes->Save(file);
752 suffixDocId->Save(file);
754 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
755 throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
756 if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
757 throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
758 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
759 throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
761 endmarkerDocId->Save(file);
762 emptyTextRank->Save(file);
768 * Load index from a file handle
770 * Throws a std::runtime_error exception on i/o error.
771 * For more info, see CSA::Save().
773 void CSA::Load(FILE *file, unsigned samplerate)
776 delete dynFMI; dynFMI = 0;
777 delete alphabetrank; alphabetrank = 0;
778 delete sampled; sampled = 0;
779 delete suffixes; suffixes = 0;
780 delete suffixDocId; suffixDocId = 0;
781 delete positions; positions = 0;
782 delete [] codetable; codetable = 0;
784 delete endmarkerDocId; endmarkerDocId = 0;
785 delete emptyTextRank; emptyTextRank = 0;
786 // FIXME Remove following:
787 delete textLength; textLength = 0;
788 delete textStartPos; textStartPos = 0;
790 this->maxTextLength = 0;
791 this->numberOfTexts = 0;
792 this->numberOfAllTexts = 0;
793 this->samplerate = samplerate;
797 if (std::fread(&verFlag, 1, 1, file) != 1)
798 throw std::runtime_error("CSA::Load(): file read error (version flag).");
799 if (verFlag != CSA::versionFlag)
800 throw std::runtime_error("CSA::Load(): invalid save file version.");
802 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
803 throw std::runtime_error("CSA::Load(): file read error (n).");
804 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
805 throw std::runtime_error("CSA::Load(): file read error (samplerate).");
806 // FIXME samplerate can not be changed during load.
807 // if (this->samplerate == 0)
808 // this->samplerate = samplerate;
810 for(ulong i = 0; i < 256; ++i)
811 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
812 throw std::runtime_error("CSA::Load(): file read error (C table).");
814 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
815 throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
817 alphabetrank = static_sequence::load(file);
818 sampled = new BSGAP(file);
819 suffixes = new BlockArray(file);
820 suffixDocId = new BlockArray(file);
822 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
823 throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
824 if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
825 throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
826 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
827 throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
829 endmarkerDocId = new BlockArray(file);
830 emptyTextRank = new BSGAP(file);
832 // FIXME Construct data structures with new samplerate
833 //maketables(); // FIXME: this will redo text length tables
839 * Rest of the functions follow...
844 uchar * CSA::Substring(TextPosition i, TextPosition l) const
846 uchar *result = new uchar[l + 1];
854 TextPosition k = i + l - 1;
855 // Check for end of the string
862 TextPosition skip = samplerate - k % samplerate - 1;
864 if (k / samplerate + 1 >= n / samplerate)
871 j = (*positions)[k/samplerate+1];
872 //cout << samplerate << ' ' << j << '\n';
875 for (dist = 0; dist < skip + l; dist++)
877 int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp);
880 // Rank among the end-markers in BWT
881 unsigned endmarkerRank = alphabetrank->rank(0, j) - 1;
882 j = (*endmarkerDocId)[endmarkerRank]; // LF-mapping for end-marker
885 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
887 result[l + skip - dist - 1] = c;
893 /*TextPosition CSA::inverse(TextPosition i)
895 TextPosition skip = samplerate - i % samplerate;
897 if (i / samplerate + 1 >= n / samplerate)
904 j = (*positions)[i/samplerate+1];
905 //cout << samplerate << ' ' << j << '\n';
910 int c = alphabetrank->charAtPos(j);
911 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
917 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
919 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
920 int c = (int)pattern[m-1];
922 TextPosition sp = C[c];
923 TextPosition ep = C[c+1]-1;
924 while (sp<=ep && i>=1)
926 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
927 c = (int)pattern[--i];
928 sp = C[c]+alphabetrank->rank(c,sp-1);
929 ep = C[c]+alphabetrank->rank(c,ep)-1;
939 /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
941 sp = C[(int)c]+alphabetrank->rank(c,sp-1);
942 ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
950 CSA::TextPosition CSA::Lookup(TextPosition i) const
953 while (!sampled->IsBitSet(i))
956 int c = alphabetrank->access(i);
959 // End-markers are sampled
960 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
961 DocId docId = (*endmarkerDocId)[endmarkerRank];
962 return (*textStartPos)[docId] + dist;
964 i = C[c]+alphabetrank->rank(c,i)-1;
968 return (*suffixes)[sampled->rank(i)-1]+dist;
980 delete endmarkerDocId;
983 delete emptyTextRank;
986 void CSA::makewavelet(uchar *bwt)
995 if (C[i]>0) {min = i; break;}
996 for (i=255;i>=min;--i)
997 if (C[i]>0) {max = i; break;}
1000 /*for(i = 0; i < 256; i++)
1001 if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
1004 ulong prev=C[0], temp;
1006 for (i=1;i<256;i++) {
1011 // this->codetable = node::makecodetable(bwt,n);
1012 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
1014 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
1016 uint *text = new uint[n];
1017 for (i = 0; i < n; ++i) // Silly
1022 alphabet_mapper * am = new alphabet_mapper_none();
1023 static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
1025 //cout << "Building Huffman table..."; cout.flush();
1027 wt_coder * wtc = new wt_coder_huff(text,n,am);
1029 //cout << "done" << endl; cout.flush();
1030 //cout << "Building static_sequence..."; cout.flush();
1032 alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am);
1036 /* for (i = 0; i < n; ++i)
1038 uchar c = alphabetrank->charAtPos(i);
1039 TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
1040 printf("LF[%lu] = %lu\n", i, j);
1044 void CSA::maketables()
1046 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
1048 ulong *sampledpositions = new ulong[n/W+1];
1049 unsigned ceilLog2n = Tools::CeilLog2(n);
1050 positions = new BlockArray(sampleLength, ceilLog2n);
1051 BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
1053 // Mapping from end-markers to doc ID's:
1054 endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
1057 for (i=0;i<n/W+1;i++)
1058 sampledpositions[i]=0lu;
1060 ulong x,p=bwtEndPos;
1061 ulong sampleCount = 0;
1062 // Keeping track of text position of end-markers seen
1063 ulong posOfSuccEndmarker = n;
1064 DocId textId = numberOfTexts;
1069 for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
1070 // i substitutes SA->GetPos(i)
1073 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1074 Tools::SetField(sampledpositions,1,p,1);
1075 (*positions)[sampleCount] = p;
1076 (*tmpSuffix)[sampleCount] = x; // FIXME remove
1080 uchar c = alphabetrank->access(p);
1085 // Record the order of end-markers in BWT:
1086 ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
1087 (*endmarkerDocId)[endmarkerRank] = textId;
1089 // Store text length and text start position:
1090 if (textId < (DocId)numberOfTexts - 1)
1092 (*textLength)[textId + 1] = posOfSuccEndmarker - x;
1093 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1094 posOfSuccEndmarker = x;
1097 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1098 p = textId; // Correct LF-mapping to the last char of the previous text.
1101 p = C[c]+alphabetrank->rank(c, p)-1;
1103 assert(textId == 0);
1106 for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it, ++i)
1108 int docc = (*endmarkerDocId)[i];
1109 ulong poss = (*endmarkerPos)[i];
1110 printf("endm[%u] = %lu (text pos: %lu) (recorded: %d, %lu)\n", (it->second).first, it->first, (it->second).second, docc, poss);
1113 for (i = 0; i < numberOfTexts; ++ i)
1115 //std::cout << "textlength = " << dynTextLength[i].first << " vs " << (*textLength)[i] << ", textStartPos = " << dynTextLength[i].second << " vs " << (*textStartPos)[i] << std::endl;
1116 assert(dynTextLength[i].first == (*textLength)[i]);
1117 assert(dynTextLength[i].second == (*textStartPos)[i]);
1120 sampled = new BSGAP(sampledpositions,n,true);
1121 sampleLength = sampled->rank(n-1);
1122 assert(sampleCount == sampleLength);
1123 // std::cout << ";sampleLength;" << sampleLength << std::endl;
1124 // Suffixes == offset from text start position
1125 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1126 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1129 for(i=0; i<sampleLength; i++) {
1130 assert((*positions)[i] < n);
1131 j = sampled->rank((*positions)[i]);
1132 if (j==0) j=sampleLength;
1133 TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate;
1134 (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]);
1136 assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts);
1137 assert((*suffixDocId)[j-1] < numberOfTexts);
1138 // calculate offset from text start:
1139 (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1141 // FIXME Temp, remove
1148 delete textStartPos;
1154 * Finds document identifier for given text position
1156 * Starting text position of the document is stored into second parameter.
1157 * Binary searching on text starting positions.
1159 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
1164 DocId b = numberOfTexts - 1;
1167 DocId c = a + (b - a)/2;
1168 if ((*textStartPos)[c] > i)
1170 else if ((*textStartPos)[c+1] > i)
1176 assert(a < (DocId)numberOfTexts);
1177 assert(i >= (*textStartPos)[a]);
1178 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1182 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1184 TCodeEntry *result = new TCodeEntry[ 256 ];
1186 count_chars( text, n, result );
1187 std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1189 // First I push all the leaf nodes into the queue
1191 for ( unsigned int i = 0 ; i < 256 ; i++ )
1192 if ( result[ i ].count )
1193 q.push(node( i, result[ i ].count ) );
1195 // This loop removes the two smallest nodes from the
1196 // queue. It creates a new internal node that has
1197 // those two nodes as children. The new internal node
1198 // is then inserted into the priority queue. When there
1199 // is only one node in the priority queue, the tree
1203 while ( q.size() > 1 ) {
1204 node *child0 = new node( q.top() );
1206 node *child1 = new node( q.top() );
1208 q.push( node( child0, child1 ) );
1211 // Now I compute and return the codetable
1213 q.top().maketable(0u,0u, result);
1219 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1223 child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1224 child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1230 codetable[value].code = code;
1231 codetable[value].bits = bits;
1235 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1238 for (i = 0 ; i < 256 ; i++ )
1239 counts[ i ].count = 0;
1241 counts[(int)text[i]].count++;
1244 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1245 return x | (bit << pos);