1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #include "TextCollection.h"
23 // Conflicts with std::min and std::max
26 #include "dcover/bwt.hpp"
28 #include "HeapProfiler.h" // FIXME remove
37 #include <cstring> // For strlen()
43 using SXSI::TextCollection;
45 // Save file version info
46 const uchar CSA::versionFlag = 2;
48 ////////////////////////////////////////////////////////////////////////////
49 // Class CSA::THuffAlphabetRank
51 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
57 this->codetable = codetable;
59 bool *B = new bool[n];
62 for (i=0; i< n; i++) {
63 printf("%c:", (char)((int)s[i]-128));
64 for (r=0;r<codetable[(int)s[i]].bits;r++)
65 if (codetable[(int)s[i]].code & (1u <<r))
71 if (level > 100) return;*/
73 if (codetable[(int)s[i]].code & (1u << level)) {
78 if (sum==0 || sum==n) {
83 uchar *sfirst, *ssecond;
85 sfirst = new uchar[n-sum];
87 ssecond = new uchar[sum];
90 if (B[i]) ssecond[k++] = s[i];
91 else sfirst[j++] = s[i];
92 ulong *Binbits = new ulong[n/W+1];
94 Tools::SetField(Binbits,1,i,B[i]);
96 bitrank = new BitRank(Binbits,n,true);
98 left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
102 right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
108 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
109 // testing that the code works correctly
117 if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
119 printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
125 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
126 if (left!=NULL) delete left;
127 if (right!=NULL) delete right;
133 ////////////////////////////////////////////////////////////////////////////
137 * Constructor inits an empty dynamic FM-index.
138 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
140 CSA::CSA(unsigned samplerate)
141 : n(0), samplerate(0), alphabetrank(0), codetable(0), sampled(0), suffixes(0),
142 suffixDocId(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
145 this->samplerate = samplerate;
148 // FIXME TODO : DynFMI needs distribution of characters before hand
149 // This will create fully balanced wavelet tree for all chars [0, 255].
151 for (unsigned i = 0; i < 255; ++i)
154 dynFMI = new DynFMI(temp, 1, 255, false);
156 /* Debug code: take char distribution from data.txt.
157 uchar *temp = Tools::GetFileContents("data.txt", 0);
158 dynFMI = new DynFMI(temp,1790000, strlen((char *)temp), false);
166 * Given text must include end-marker.
167 * Text identifiers are numbered starting from 0.
169 void CSA::InsertText(uchar const * text)
176 TextPosition m = std::strlen((char *)text) + 1;
177 if (m > maxTextLength)
178 maxTextLength = m; // Store length of the longest text seen so far.
183 this->numberOfTexts ++;
184 this->numberOfAllTexts ++;
186 dynFMI->addText(text, m);
189 texts.append((char *) text);
190 texts.append(1, '\0');
194 emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
195 this->numberOfAllTexts ++;
199 void CSA::MakeStatic()
204 throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
207 // Bitvector of empty texts
209 //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
210 ulong *tempB = new ulong[numberOfAllTexts/W + 1];
211 for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
213 for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
214 Tools::SetField(tempB, 1, (*it), 1);
216 emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
219 /* std::cout << "texts: ";
220 for (ulong i = 0; i < n; i++)
222 std::cout << texts[i];
224 std::cout << (int)texts[i];
225 std::cout << std::endl;*/
226 uchar *bwt = new uchar[n];
228 // FIXME More succinct solution needed
229 unsigned* maptexts = new unsigned[n+1];
230 // Map text chars to [0..255+numberOfTexts]
232 for (ulong i = 0; i < n; ++i)
234 maptexts[i] = ++count; // endmarkers \in [1..numberOfTexts]
236 uchar c = (uchar)texts[i];
237 maptexts[i] = (unsigned)c + numberOfTexts + 1;
242 texts.reserve(0); // Release capacity
243 assert(count == numberOfTexts);
245 bwtEndPos = (ulong)compute_bwt(&maptexts[0], &maptexts[n],
246 &bwt[0], numberOfTexts + 1);
248 bwt[--bwtEndPos] = '\0';
250 } // End of bw transform
254 uchar *bwtTest = dynFMI->getBWT();
255 /* printf("123456789012345678901234567890123456789\n");
256 for (TextPosition i = 0; i < n && i < 100; i ++)
258 printf("%d", (int)bwt[i]);
260 printf("%c", bwt[i]);
262 for (TextPosition i = 0; i < n && i < 100; i ++)
264 printf("%d", (int)bwtTest[i]);
266 printf("%c", bwtTest[i]);
270 assert(numberOfTexts == dynFMI->getCollectionSize());
274 for (ulong i = 0; i < n; ++i)
275 if (bwt[i] != bwtTest[i])
277 std::cout << "i = " << i << ", bwt = " << (unsigned)bwt[i] << ", " << (unsigned)bwtTest[i] << std::endl;
282 #endif // CSA_TEST_BWT
284 makewavelet(bwt); // Deletes bwt!
287 // Make sampling tables
291 bool CSA::EmptyText(DocId k) const
293 assert(k < (DocId)numberOfTexts);
294 if (emptyTextRank->IsBitSet(k))
299 uchar* CSA::GetText(DocId k) const
301 assert(k < (DocId)numberOfTexts);
304 if (emptyTextRank->IsBitSet(k, &textRank))
306 uchar* result = new uchar[1];
310 // Map to non-empty text
311 k -= textRank; //emptyTextRank->rank(k);
316 // Reserve average string length to avoid reallocs
317 result.reserve(n/numberOfTexts);
319 uchar c = alphabetrank->access(i);
323 i = C[c]+alphabetrank->rank(c,i)-1;
325 c = alphabetrank->access(i); // "next" char.
328 // Convert to uchar (FIXME return string?)
330 uchar* res = new uchar[i+1];
332 for (ulong j = 0; j < i; ++j)
333 res[i-j-1] = result[j];
338 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
340 assert(k < (DocId)numberOfTexts);
341 assert(j < (*textLength)[k]);
345 if (emptyTextRank->IsBitSet(k, &textRank))
347 uchar* result = new uchar[1];
352 // Map to non-empty text
353 k -= textRank; // emptyTextRank->rank(k);
355 // Start position of k'th text
356 ulong start = (*textStartPos)[k];
358 return Substring(i + start, j-i+1);
361 /******************************************************************
362 * Existential queries
364 bool CSA::IsPrefix(uchar const * pattern) const
366 TextPosition m = strlen((char *)pattern);
370 TextPosition sp = 0, ep = 0;
371 Search(pattern, m, &sp, &ep);
373 // Check for end-marker(s) in result interval
374 if (CountEndmarkers(sp, ep))
379 bool CSA::IsSuffix(uchar const *pattern) const
381 // Here counting is as fast as IsSuffix():
382 if (CountSuffix(pattern) > 0)
387 bool CSA::IsEqual(uchar const *pattern) const
389 TextPosition m = std::strlen((char *)pattern);
392 if (numberOfAllTexts - numberOfTexts > 0u)
393 return true; // Empty texts exists
394 return false; // No empty texts exists
397 TextPosition sp = 0, ep = 0;
398 // Match including end-marker
399 Search(pattern, m+1, &sp, &ep);
401 // Check for end-marker(s) in result interval
402 if (CountEndmarkers(sp, ep))
407 bool CSA::IsContains(uchar const * pattern) const
409 TextPosition m = strlen((char *)pattern);
413 TextPosition sp = 0, ep = 0;
414 // Just check if pattern exists somewhere
415 ulong count = Search(pattern, m, &sp, &ep);
422 bool CSA::IsLessThan(uchar const*) const
429 /******************************************************************
432 ulong CSA::Count(uchar const * pattern) const
434 TextPosition m = strlen((char *)pattern);
438 TextPosition sp = 0, ep = 0;
439 unsigned count = (unsigned) Search(pattern, m, &sp, &ep);
443 unsigned CSA::CountPrefix(uchar const * pattern) const
445 TextPosition m = strlen((char *)pattern);
447 return numberOfAllTexts;
449 TextPosition sp = 0, ep = 0;
450 Search(pattern, m, &sp, &ep);
452 // Count end-markers in result interval
453 return CountEndmarkers(sp, ep);
456 unsigned CSA::CountSuffix(uchar const * pattern) const
458 TextPosition m = strlen((char *)pattern);
460 return numberOfAllTexts;
462 TextPosition sp = 0, ep = 0;
463 // Search with end-marker
464 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
469 unsigned CSA::CountEqual(uchar const *pattern) const
471 TextPosition m = strlen((char const *)pattern);
473 return numberOfAllTexts - numberOfTexts; // Empty texts.
475 TextPosition sp = 0, ep = 0;
476 // Match including end-marker
477 Search(pattern, m+1, &sp, &ep);
479 // Count end-markers in result interval
480 return CountEndmarkers(sp, ep);
483 unsigned CSA::CountContains(uchar const * pattern) const
485 TextPosition m = strlen((char const *)pattern);
487 return numberOfAllTexts; // Total number of texts.
489 // Here counting is as slow as fetching the result set
490 // because we would have to filter out occ's that fall within same document.
491 TextCollection::document_result result = Contains(pattern);
492 return result.size();
495 unsigned CSA::CountLessThan(uchar const * pattern) const
503 * Document reporting queries
505 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
507 TextPosition m = strlen((char *)pattern);
509 return TextCollection::document_result(); // FIXME Should return all 1...k
511 TextPosition sp = 0, ep = 0;
512 Search(pattern, m, &sp, &ep);
514 TextCollection::document_result result;
516 // Report end-markers in result interval
517 unsigned resultSize = CountEndmarkers(sp, ep);
521 result.reserve(resultSize); // Try to avoid reallocation.
523 // Iterate through end-markers in [sp,ep]:
526 i = alphabetrank->rank(0, sp - 1);
529 // End-marker we found belongs to the "preceeding" doc in the collection
530 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
532 docId = emptyTextRank->select0(docId+1);
533 result.push_back(docId);
542 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
544 TextPosition m = strlen((char *)pattern);
546 return TextCollection::document_result(); // FIXME Should return all 1...k
548 TextPosition sp = 0, ep = 0;
549 // Search with end-marker
550 Search(pattern, m+1, &sp, &ep);
552 TextCollection::document_result result;
553 result.reserve(ep-sp+1); // Try to avoid reallocation.
555 ulong sampled_rank_i = 0;
556 // Check each occurrence
557 for (; sp <= ep; ++sp)
561 uchar c = alphabetrank->access(i);
562 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
564 i = C[c]+alphabetrank->rank(c,i)-1;
565 c = alphabetrank->access(i);
567 // Assert: c == '\0' OR sampled->IsBitSet(i)
571 // Rank among the end-markers in BWT
572 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
574 // End-marker that we found belongs to the "preceeding" doc in collection:
575 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
577 docId = emptyTextRank->select0(docId+1);
578 result.push_back(docId);
580 else // Sampled position
582 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
584 docId = emptyTextRank->select0(docId+1);
585 result.push_back(docId);
592 TextCollection::document_result CSA::Equal(uchar const *pattern) const
594 TextPosition m = strlen((char const *)pattern);
596 return TextCollection::document_result(); // FIXME Should return all empty texts
598 TextPosition sp = 0, ep = 0;
599 // Match including end-marker
600 Search(pattern, m+1, &sp, &ep);
602 TextCollection::document_result result;
604 // Report end-markers in result interval
605 unsigned resultSize = CountEndmarkers(sp, ep);
609 result.reserve(resultSize); // Try to avoid reallocation.
613 i = alphabetrank->rank(0, sp - 1);
616 // End-marker we found belongs to the "previous" doc in the collection
617 DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
619 docId = emptyTextRank->select0(docId+1);
620 result.push_back(docId);
630 TextCollection::document_result CSA::Contains(uchar const * pattern) const
632 TextPosition m = strlen((char *)pattern);
634 return TextCollection::document_result();
636 TextPosition sp = 0, ep = 0;
637 // Search all occurrences
638 Search(pattern, m, &sp, &ep);
640 // We want unique document indentifiers, using std::set to collect them
641 std::set<DocId> resultSet;
643 ulong sampled_rank_i = 0;
644 // Check each occurrence
645 for (; sp <= ep; ++sp)
648 uchar c = alphabetrank->access(i);
649 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
651 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
652 c = alphabetrank->access(i);
656 // Rank among the end-markers in BWT
657 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
659 // End-marker that we found belongs to the "preceeding" doc in collection:
660 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
661 resultSet.insert(docId);
665 DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
666 assert((unsigned)di < numberOfTexts);
667 resultSet.insert(di);
671 // Convert std::set to std::vector
672 TextCollection::document_result result(resultSet.begin(), resultSet.end());
674 for (document_result::iterator it = result.begin(); it != result.end(); ++it)
675 *it = emptyTextRank->select0(*it+1);
679 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
683 return document_result();
687 * Full result set queries
689 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
691 TextPosition m = strlen((char *)pattern);
693 return full_result(); // FIXME Throw exception?
695 TextPosition sp = 0, ep = 0;
696 // Search all occurrences
697 Search(pattern, m, &sp, &ep);
700 result.reserve(ep-sp+1); // Try to avoid reallocation.
702 ulong sampled_rank_i = 0;
703 // Report each occurrence
704 for (; sp <= ep; ++sp)
707 TextPosition dist = 0;
708 uchar c = alphabetrank->access(i);
709 while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
711 i = C[c]+alphabetrank->rank(c,i)-1;
712 c = alphabetrank->access(i);
717 // Rank among the end-markers in BWT
718 unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
720 // End-marker that we found belongs to the "preceeding" doc in collection:
721 DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
723 docId = emptyTextRank->select0(docId+1);
724 result.push_back(make_pair(docId, dist));
728 TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
729 DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
730 // textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
733 docId = emptyTextRank->select0(docId+1);
734 result.push_back(make_pair(docId, textPos));
743 * Save index to a file handle
745 * Throws a std::runtime_error exception on i/o error.
746 * First byte that is saved represents the version number of the save file.
747 * In version 2 files, the data fields are:
752 TextPosition bwtEndPos;
753 static_sequence * alphabetrank;
755 BlockArray *suffixes;
756 BlockArray *suffixDocId;
757 unsigned numberOfTexts;
758 unsigned numberOfAllTexts;
760 BlockArray *endmarkerDocId;
761 BSGAP *emptyTextRank;
763 void CSA::Save(FILE *file) const
765 // Saving version info:
766 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
767 throw std::runtime_error("CSA::Save(): file write error (version flag).");
769 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
770 throw std::runtime_error("CSA::Save(): file write error (n).");
771 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
772 throw std::runtime_error("CSA::Save(): file write error (samplerate).");
774 for(ulong i = 0; i < 256; ++i)
775 if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
776 throw std::runtime_error("CSA::Save(): file write error (C table).");
778 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
779 throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
781 alphabetrank->save(file);
783 suffixes->Save(file);
784 suffixDocId->Save(file);
786 if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
787 throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
788 if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
789 throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
790 if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
791 throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
793 endmarkerDocId->Save(file);
794 emptyTextRank->Save(file);
800 * Load index from a file handle
802 * Throws a std::runtime_error exception on i/o error.
803 * For more info, see CSA::Save().
805 void CSA::Load(FILE *file, unsigned samplerate)
809 delete dynFMI; dynFMI = 0;
811 delete alphabetrank; alphabetrank = 0;
812 delete sampled; sampled = 0;
813 delete suffixes; suffixes = 0;
814 delete suffixDocId; suffixDocId = 0;
815 delete [] codetable; codetable = 0;
817 delete endmarkerDocId; endmarkerDocId = 0;
818 delete emptyTextRank; emptyTextRank = 0;
820 this->maxTextLength = 0;
821 this->numberOfTexts = 0;
822 this->numberOfAllTexts = 0;
823 this->samplerate = samplerate;
827 if (std::fread(&verFlag, 1, 1, file) != 1)
828 throw std::runtime_error("CSA::Load(): file read error (version flag).");
829 if (verFlag != CSA::versionFlag)
830 throw std::runtime_error("CSA::Load(): invalid save file version.");
832 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
833 throw std::runtime_error("CSA::Load(): file read error (n).");
834 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
835 throw std::runtime_error("CSA::Load(): file read error (samplerate).");
836 // FIXME samplerate can not be changed during load.
837 // if (this->samplerate == 0)
838 // this->samplerate = samplerate;
840 for(ulong i = 0; i < 256; ++i)
841 if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
842 throw std::runtime_error("CSA::Load(): file read error (C table).");
844 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
845 throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
847 alphabetrank = static_sequence::load(file);
848 sampled = new BSGAP(file);
849 suffixes = new BlockArray(file);
850 suffixDocId = new BlockArray(file);
852 if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
853 throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
854 if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
855 throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
856 if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
857 throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
859 endmarkerDocId = new BlockArray(file);
860 emptyTextRank = new BSGAP(file);
862 // FIXME Construct data structures with new samplerate
869 * Rest of the functions follow...
873 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
875 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
876 int c = (int)pattern[m-1];
878 TextPosition sp = C[c];
879 TextPosition ep = C[c+1]-1;
880 while (sp<=ep && i>=1)
882 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
883 c = (int)pattern[--i];
884 sp = C[c]+alphabetrank->rank(c,sp-1);
885 ep = C[c]+alphabetrank->rank(c,ep)-1;
903 delete [] codetable; // FIXME remove
905 delete endmarkerDocId;
906 delete emptyTextRank;
909 void CSA::makewavelet(uchar *bwt)
918 if (C[i]>0) {min = i; break;}
919 for (i=255;i>=min;--i)
920 if (C[i]>0) {max = i; break;}
922 ulong prev=C[0], temp;
924 for (i=1;i<256;i++) {
929 // this->codetable = node::makecodetable(bwt,n);
930 // alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
932 //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
934 // FIXME: static_sequence_wvtree accepts only uint arrays
935 uint *text = new uint[n];
936 for (i = 0; i < n; ++i) // Silly
941 alphabet_mapper * am = new alphabet_mapper_none();
942 static_bitsequence_builder * bmb = new static_bitsequence_builder_brw32(16); // FIXME samplerate?
943 wt_coder * wtc = new wt_coder_huff(text,n,am);
944 alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am);
950 void CSA::maketables()
952 // Calculate BWT end-marker position (of last inserted text)
953 // Note: bwtEndPos already set!
954 // and the length of the first text (counter l):
959 uchar c = alphabetrank->access(i);
962 i = C[c]+alphabetrank->rank(c, i)-1;
964 c = alphabetrank->access(i);
966 assert(i == bwtEndPos); // compare result
969 // Build up arrays for text length and starting positions
970 // FIXME Temp, remove
971 //BlockArray* textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
972 BlockArray* textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
973 //(*textLength)[0] = l;
974 (*textStartPos)[0] = 0;
977 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
978 unsigned ceilLog2n = Tools::CeilLog2(n);
979 BlockArray* positions = new BlockArray(sampleLength, ceilLog2n);
980 BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
982 // Mapping from end-markers to doc ID's:
983 endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
985 ulong *sampledpositions = new ulong[n/W+1];
986 for (ulong i=0;i<n/W+1;i++)
987 sampledpositions[i]=0lu;
990 ulong sampleCount = 0;
991 // Keeping track of text position of prev. end-marker seen
992 ulong posOfSuccEndmarker = n;
993 DocId textId = numberOfTexts;
998 for (ulong i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
999 // i substitutes SA->GetPos(i)
1002 if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
1003 Tools::SetField(sampledpositions,1,p,1);
1004 (*positions)[sampleCount] = p;
1005 (*tmpSuffix)[sampleCount] = x; // FIXME remove
1009 uchar c = alphabetrank->access(p);
1014 // Record the order of end-markers in BWT:
1015 ulong endmarkerRank = alphabetrank->rank(0, p) - 1;
1016 (*endmarkerDocId)[endmarkerRank] = textId;
1018 // Store text length and text start position:
1019 if (textId < (DocId)numberOfTexts - 1)
1021 //(*textLength)[textId + 1] = posOfSuccEndmarker - x;
1022 (*textStartPos)[textId + 1] = x; // x is the position of end-marker.
1023 posOfSuccEndmarker = x;
1026 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
1027 p = textId; // Correct LF-mapping to the last char of the previous text.
1030 p = C[c]+alphabetrank->rank(c, p)-1;
1032 assert(textId == 0);
1034 sampled = new BSGAP(sampledpositions,n,true);
1035 sampleLength = sampled->rank(n-1);
1036 assert(sampleCount == sampleLength);
1038 // Suffixes store an offset from the text start position
1039 suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
1040 suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
1042 for(ulong i=0; i<sampleLength; i++) {
1043 assert((*positions)[i] < n);
1044 ulong j = sampled->rank((*positions)[i]);
1045 if (j==0) j=sampleLength;
1046 TextPosition textPos = (*tmpSuffix)[i];
1047 (*suffixDocId)[j-1] = DocIdAtTextPos(textStartPos, textPos);
1049 assert((unsigned)DocIdAtTextPos(textStartPos, textPos) < numberOfTexts);
1050 assert((*suffixDocId)[j-1] < numberOfTexts);
1051 // calculate offset from text start:
1052 (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
1054 // FIXME Temp, remove
1057 // delete textLength;
1058 delete textStartPos;
1063 * Finds document identifier for given text position
1065 * Starting text position of the document is stored into second parameter.
1066 * Binary searching on text starting positions.
1068 TextCollection::DocId CSA::DocIdAtTextPos(BlockArray* textStartPos, TextPosition i) const
1073 DocId b = numberOfTexts - 1;
1076 DocId c = a + (b - a)/2;
1077 if ((*textStartPos)[c] > i)
1079 else if ((*textStartPos)[c+1] > i)
1085 assert(a < (DocId)numberOfTexts);
1086 assert(i >= (*textStartPos)[a]);
1087 assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
1091 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1093 TCodeEntry *result = new TCodeEntry[ 256 ];
1095 count_chars( text, n, result );
1096 std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1098 // First I push all the leaf nodes into the queue
1100 for ( unsigned int i = 0 ; i < 256 ; i++ )
1101 if ( result[ i ].count )
1102 q.push(node( i, result[ i ].count ) );
1104 // This loop removes the two smallest nodes from the
1105 // queue. It creates a new internal node that has
1106 // those two nodes as children. The new internal node
1107 // is then inserted into the priority queue. When there
1108 // is only one node in the priority queue, the tree
1112 while ( q.size() > 1 ) {
1113 node *child0 = new node( q.top() );
1115 node *child1 = new node( q.top() );
1117 q.push( node( child0, child1 ) );
1120 // Now I compute and return the codetable
1122 q.top().maketable(0u,0u, result);
1128 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1132 child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1133 child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1139 codetable[value].code = code;
1140 codetable[value].bits = bits;
1144 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1147 for (i = 0 ; i < 256 ; i++ )
1148 counts[ i ].count = 0;
1150 counts[(int)text[i]].count++;
1153 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1154 return x | (bit << pos);