1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #include "TextCollection.h"
30 #include <cstring> // For strlen()
36 using SXSI::TextCollection;
39 ////////////////////////////////////////////////////////////////////////////
40 // Class CSA::THuffAlphabetRank
42 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
48 this->codetable = codetable;
50 bool *B = new bool[n];
53 for (i=0; i< n; i++) {
54 printf("%c:", (char)((int)s[i]-128));
55 for (r=0;r<codetable[(int)s[i]].bits;r++)
56 if (codetable[(int)s[i]].code & (1u <<r))
62 if (level > 100) return;*/
64 if (codetable[(int)s[i]].code & (1u << level)) {
69 if (sum==0 || sum==n) {
74 uchar *sfirst, *ssecond;
76 sfirst = new uchar[n-sum];
78 ssecond = new uchar[sum];
81 if (B[i]) ssecond[k++] = s[i];
82 else sfirst[j++] = s[i];
83 ulong *Binbits = new ulong[n/W+1];
85 Tools::SetField(Binbits,1,i,B[i]);
87 bitrank = new BitRank(Binbits,n,true);
89 left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
93 right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
99 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
100 // testing that the code works correctly
108 if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
110 printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
116 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
117 if (left!=NULL) delete left;
118 if (right!=NULL) delete right;
124 ////////////////////////////////////////////////////////////////////////////
128 * Constructor inits an empty dynamic FM-index.
129 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
131 CSA::CSA(unsigned samplerate)
132 : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
133 codetable(0), dynFMI(0)
135 this->samplerate = samplerate;
137 // FIXME TODO : DynFMI needs distribution of characters before hand
138 // This will create fully balanced wavelet tree for all chars [0, 255].
140 for (unsigned i = 0; i < 255; ++i)
143 dynFMI = new DynFMI(temp, 1, 255, false);
149 * Given text must include end-marker.
150 * Text identifiers are numbered starting from 0.
152 void CSA::InsertText(uchar const * text)
157 TextPosition m = std::strlen((char *)text) + 1;
158 // Store text length and starting position
159 textLength.push_back(make_pair(m, n));
161 dynFMI->addText(text, m);
164 void CSA::MakeStatic()
169 std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
173 uchar *bwt = dynFMI->getBWT();
174 /*printf("1234567890123456789\n");
175 for (TextPosition i = 0; i < n; i ++)
177 printf("%d", (int)bwt[i]);
179 printf("%c", bwt[i]);
183 /* for (TextPosition i = 1; i <= n; i ++)
185 printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
189 assert(textLength.size() == dynFMI->getCollectionSize());
196 // Calculate BWT end-marker position (of last inserted text)
198 while (bwt[i] != '\0')
201 i = C[c]+alphabetrank->rank(c, i)-1;
204 //printf("bwtEndPos = %lu\n", bwtEndPos);
208 // Make sampling tables
212 bool CSA::EmptyText(DocId k) const
214 assert(k < (DocId)textLength.size());
215 return (1 == textLength[k].first);
218 uchar* CSA::GetText(DocId k) const
220 assert(k < (DocId)textLength.size());
222 uchar* result = new uchar[textLength[k].first];
224 TextPosition j = textLength[k].first-1;
227 uchar c = alphabetrank->charAtPos(i);
232 i = C[c]+alphabetrank->rank(c,i)-1;
234 c = alphabetrank->charAtPos(i); // "next" char.
240 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
242 assert(k < (DocId)textLength.size());
243 assert(j < textLength[k].first);
246 // Start position of k'th text
247 ulong start = textLength[k].second;
249 return Substring(i + start, j-i+1);
253 * Existential queries
255 bool CSA::IsPrefix(uchar const * pattern) const
257 TextPosition m = strlen((char *)pattern);
259 return textLength.size();
261 TextPosition sp = 0, ep = 0;
262 Search(pattern, m, &sp, &ep);
264 // Check for end-marker(s) in result interval
265 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
266 it = endmarkers.lower_bound(sp);
267 if (it != endmarkers.end() && it->first <= ep)
272 bool CSA::IsSuffix(uchar const *pattern) const
274 // Here counting is as fast as isSuffix():
275 if (CountSuffix(pattern) > 0)
280 bool CSA::IsEqual(uchar const *pattern) const
282 TextPosition m = std::strlen((char *)pattern);
284 return textLength.size();
286 TextPosition sp = 0, ep = 0;
287 // Match including end-marker
288 Search(pattern, m+1, &sp, &ep);
290 // Check for end-marker(s) in result interval
291 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
292 it = endmarkers.lower_bound(sp);
293 if (it != endmarkers.end() && it->first <= ep)
298 bool CSA::IsContains(uchar const * pattern) const
300 TextPosition m = strlen((char *)pattern);
304 TextPosition sp = 0, ep = 0;
305 // Just check if pattern exists somewhere
306 ulong count = Search(pattern, m, &sp, &ep);
313 bool CSA::IsLessThan(uchar const*) const
322 unsigned CSA::CountPrefix(uchar const * pattern) const
324 TextPosition m = strlen((char *)pattern);
326 return textLength.size();
328 TextPosition sp = 0, ep = 0;
329 Search(pattern, m, &sp, &ep);
331 // Count end-markers in result interval
332 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
333 it = endmarkers.lower_bound(sp);
335 while (it != endmarkers.end() && it->first <= ep)
344 unsigned CSA::CountSuffix(uchar const * pattern) const
346 TextPosition m = strlen((char *)pattern);
348 return textLength.size();
350 TextPosition sp = 0, ep = 0;
351 // Search with end-marker
352 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
357 unsigned CSA::CountEqual(uchar const *pattern) const
359 TextPosition m = strlen((char const *)pattern);
361 return textLength.size();
363 TextPosition sp = 0, ep = 0;
364 // Match including end-marker
365 Search(pattern, m+1, &sp, &ep);
367 // Count end-markers in result interval
368 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
369 it = endmarkers.lower_bound(sp);
371 while (it != endmarkers.end() && it->first <= ep)
380 unsigned CSA::CountContains(uchar const * pattern) const
382 // Here counting is as slow as fetching the result set
383 // because we would have to filter out occ's that fall within same document.
384 TextCollection::document_result result = Contains(pattern);
385 return result.size();
388 unsigned CSA::CountLessThan(uchar const * pattern) const
395 * Document reporting queries
397 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
399 TextPosition m = strlen((char *)pattern);
401 return TextCollection::document_result();
403 TextPosition sp = 0, ep = 0;
404 Search(pattern, m, &sp, &ep);
406 TextCollection::document_result result;
408 // Report end-markers in result interval
409 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
410 it = endmarkers.lower_bound(sp);
412 while (it != endmarkers.end() && it->first <= ep)
414 // End-marker we found belongs to the "previous" doc in the collection
415 DocId docId = ((it->second).first + 1) % textLength.size();
416 result.push_back(docId);
424 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
426 TextPosition m = strlen((char *)pattern);
428 return TextCollection::document_result();
430 TextPosition sp = 0, ep = 0;
431 // Search with end-marker
432 Search(pattern, m+1, &sp, &ep);
434 TextCollection::document_result result;
436 // Check each occurrence
437 for (; sp <= ep; ++sp)
440 uchar c = alphabetrank->charAtPos(i);
441 while (c != '\0' && !sampled->IsBitSet(i))
443 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
444 c = alphabetrank->charAtPos(i);
448 // map::operator[] is not const, using find() instead:
449 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
450 // End-marker that we found belongs to the "previous" doc in collection:
451 DocId docId = (endm.first + 1) % textLength.size();
452 result.push_back(docId);
456 TextPosition textPos = suffixes[sampled->rank(i)-1];
457 result.push_back(DocIdAtTextPos(textPos));
464 TextCollection::document_result CSA::Equal(uchar const *pattern) const
466 TextPosition m = strlen((char const *)pattern);
468 return TextCollection::document_result();
470 TextPosition sp = 0, ep = 0;
471 // Match including end-marker
472 Search(pattern, m+1, &sp, &ep);
474 TextCollection::document_result result;
476 // Report end-markers in result interval
477 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
478 it = endmarkers.lower_bound(sp);
480 while (it != endmarkers.end() && it->first <= ep)
482 // End-marker that we found belongs to the "previous" doc in collection:
483 DocId docId = ((it->second).first + 1) % textLength.size();
484 result.push_back(docId);
493 TextCollection::document_result CSA::Contains(uchar const * pattern) const
495 TextPosition m = strlen((char *)pattern);
497 return TextCollection::document_result();
499 TextPosition sp = 0, ep = 0;
500 // Search all occurrences
501 Search(pattern, m, &sp, &ep);
503 // We want unique document indentifiers, using std::set to collect them
504 std::set<DocId> resultSet;
506 // Check each occurrence
507 for (; sp <= ep; ++sp)
510 uchar c = alphabetrank->charAtPos(i);
511 while (c != '\0' && !sampled->IsBitSet(i))
513 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
514 c = alphabetrank->charAtPos(i);
518 // map::operator[] is not const, using find() instead:
519 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
520 // End-marker we found belongs to the "previous" doc in collection
521 DocId docId = (endm.first + 1) % textLength.size();
522 resultSet.insert(docId);
526 TextPosition textPos = suffixes[sampled->rank(i)-1];
527 resultSet.insert(DocIdAtTextPos(textPos));
531 // Convert std::set to std::vector (TODO better way to construct result vector?)
532 TextCollection::document_result result;
533 for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
534 result.push_back(*it);
538 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
541 return document_result();
545 * Full result set queries
547 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
549 TextPosition m = strlen((char *)pattern);
551 return full_result();
553 TextPosition sp = 0, ep = 0;
554 // Search all occurrences
555 Search(pattern, m, &sp, &ep);
559 // Report each occurrence
560 for (; sp <= ep; ++sp)
563 TextPosition dist = 0;
564 uchar c = alphabetrank->charAtPos(i);
565 while (c != '\0' && !sampled->IsBitSet(i))
567 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
568 c = alphabetrank->charAtPos(i);
573 // map::operator[] is not const, using find() instead:
574 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
575 // End-marker we found belongs to the "previous" doc in the collection:
576 DocId docId = (endm.first+1)%textLength.size();
577 result.push_back(make_pair(docId, dist));
581 TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
582 // Read document identifier and its starting position in text order
583 DocId docId = DocIdAtTextPos(textPos);
584 result.push_back(make_pair(docId, textPos - textLength[docId].second));
593 * Finds document identifier for given text position
595 * Starting text position of the document is stored into second parameter.
596 * Binary searching on text starting positions.
598 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
603 DocId b = textLength.size() - 1;
606 DocId c = a + (b - a)/2;
607 if (textLength[c].second > i)
609 else if (textLength[c+1].second > i)
615 assert(a < (DocId)textLength.size());
616 assert(i > textLength[a].second);
617 assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
622 * Save index to a file handle
624 * Throws a std::runtime_error exception on i/o error.
625 * First byte that is saved represents the version number of the save file.
626 * In version 1 files, the data fields are:
627 * <uchar version> version info;
628 * <unsigned s> samplerate;
629 * <ulong n> length of the BWT;
630 * <ulong bwt> end marker position in BWT;
631 * <uchar *> BWT string of length n;
632 * <unsigned r> number of texts;
633 * <vector textLength>
634 * array of <ulong, ulong> pairs.
636 * TODO: Save the data structures instead of BWT sequence?
638 void CSA::Save(FILE *file) const
640 // Saving version 1 data:
641 uchar versionFlag = 1;
642 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
643 throw std::runtime_error("CSA::Save(): file write error (version flag).");
645 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
646 throw std::runtime_error("CSA::Save(): file write error (samplerate).");
648 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
649 throw std::runtime_error("CSA::Save(): file write error (n).");
651 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
652 throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
654 for (ulong offset = 0; offset < n; offset ++)
656 uchar c = alphabetrank->charAtPos(offset);
657 if (std::fwrite(&c, sizeof(uchar), 1, file) != 1)
658 throw std::runtime_error("CSA::Save(): file write error (bwt sequence).");
661 unsigned r = textLength.size();
662 if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1)
663 throw std::runtime_error("CSA::Save(): file write error (r).");
665 for (r = 0; r < textLength.size(); ++ r)
667 if (std::fwrite(&(textLength[r].first), sizeof(TextPosition), 1, file) != 1)
668 throw std::runtime_error("CSA::Save(): file write error (text length).");
669 if (std::fwrite(&(textLength[r].second), sizeof(TextPosition), 1, file) != 1)
670 throw std::runtime_error("CSA::Save(): file write error (text start).");
676 * Load index from a file handle
678 * Throws a std::runtime_error exception on i/o error.
679 * For more info, see CSA::Save().
681 void CSA::Load(FILE *file, unsigned samplerate)
684 delete dynFMI; dynFMI = 0;
685 delete alphabetrank; alphabetrank = 0;
686 delete sampled; sampled = 0;
687 delete [] suffixes; suffixes = 0;
688 delete [] positions; positions = 0;
689 delete [] codetable; codetable = 0;
694 this->samplerate = samplerate;
697 uchar versionFlag = 0;
698 if (std::fread(&versionFlag, 1, 1, file) != 1)
699 throw std::runtime_error("CSA::Load(): file read error (version flag).");
700 if (versionFlag != 1)
701 throw std::runtime_error("CSA::Load(): invalid start byte.");
703 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
704 throw std::runtime_error("CSA::Load(): file read error (samplerate).");
705 if (this->samplerate == 0)
706 this->samplerate = samplerate;
708 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
709 throw std::runtime_error("CSA::Load(): file read error (n).");
711 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
712 throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
714 uchar *bwt = new uchar[n];
715 for (ulong offset = 0; offset < n; offset ++)
716 if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1)
717 throw std::runtime_error("CSA::Load(): file read error (bwt sequence).");
720 if (std::fread(&r, sizeof(unsigned), 1, file) != 1)
721 throw std::runtime_error("CSA::Load(): file read error (r).");
725 TextPosition length = 0, start = 0;
726 if (std::fread(&length, sizeof(TextPosition), 1, file) != 1)
727 throw std::runtime_error("CSA::Load(): file read error (text length).");
728 if (std::fread(&start, sizeof(TextPosition), 1, file) != 1)
729 throw std::runtime_error("CSA::Load(): file read error (text start).");
731 textLength.push_back(make_pair(length, start));
735 // Construct data structures
744 * Rest of the functions follow...
747 uchar * CSA::Substring(TextPosition i, TextPosition l) const
749 uchar *result = new uchar[l + 1];
757 TextPosition k = i + l - 1;
758 // Check for end of the string
765 TextPosition skip = samplerate - k % samplerate - 1;
767 if (k / samplerate + 1 >= n / samplerate)
774 j = positions[k/samplerate+1];
775 //cout << samplerate << ' ' << j << '\n';
778 for (dist = 0; dist < skip + l; dist++)
780 int c = alphabetrank->charAtPos(j);
783 // map::operator[] is not const, using find() instead:
784 pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
785 j = endm.first; // LF-mapping for end-marker
788 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
790 result[l + skip - dist - 1] = c;
796 /*TextPosition CSA::inverse(TextPosition i)
798 TextPosition skip = samplerate - i % samplerate;
800 if (i / samplerate + 1 >= n / samplerate)
807 j = positions[i/samplerate+1];
808 //cout << samplerate << ' ' << j << '\n';
813 int c = alphabetrank->charAtPos(j);
814 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
820 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
821 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
822 int c = (int)pattern[m-1];
824 TextPosition sp = C[c];
825 TextPosition ep = C[c+1]-1;
826 while (sp<=ep && i>=1)
828 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
829 c = (int)pattern[--i];
830 sp = C[c]+alphabetrank->rank(c,sp-1);
831 ep = C[c]+alphabetrank->rank(c,ep)-1;
841 /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
843 sp = C[(int)c]+alphabetrank->rank(c,sp-1);
844 ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
852 CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
855 while (!sampled->IsBitSet(i))
857 int c = alphabetrank->charAtPos(i);
860 // map::operator[] is not const, using find() instead:
861 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
862 return endm.second + dist; // returns text position.
864 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
868 return suffixes[sampled->rank(i)-1]+dist;
880 void CSA::makewavelet(uchar *bwt)
889 if (C[i]>0) {min = i; break;}
890 for (i=255;i>=min;--i)
891 if (C[i]>0) {max = i; break;}
894 /* for(i = 0; i < 256; i++)
895 if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
898 ulong prev=C[0], temp;
900 for (i=1;i<256;i++) {
905 this->codetable = node::makecodetable(bwt,n);
906 alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
907 //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
909 /* for (i = 0; i < n; ++i)
911 uchar c = alphabetrank->charAtPos(i);
912 TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
913 printf("LF[%lu] = %lu\n", i, j);
917 void CSA::maketables()
919 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
921 ulong *sampledpositions = new ulong[n/W+1];
922 suffixes = new ulong[sampleLength];
923 positions = new ulong[sampleLength];
926 for (i=0;i<n/W+1;i++)
927 sampledpositions[i]=0lu;
930 DocId textId = textLength.size();
935 for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
936 // i substitutes SA->GetPos(i)
939 if (x % samplerate == 0) {
940 Tools::SetField(sampledpositions,1,p,1);
941 positions[x/samplerate] = p;
944 uchar c = alphabetrank->charAtPos(p);
947 // Record the order of end-markers in BWT:
949 endmarkers[p] = make_pair(textId, (TextPosition)x);
950 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
951 p = textId; // Correct LF-mapping to the last char of the previous text.
954 p = C[c]+alphabetrank->rank(c, p)-1;
957 /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
959 printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
962 sampled = new BitRank(sampledpositions,n,true);
965 for(i=0; i<sampleLength; i++) {
966 j = sampled->rank(positions[i]);
967 if (j==0) j=sampleLength;
968 suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
972 uchar * CSA::LoadFromFile(const char *filename)
975 std::ifstream file (filename, std::ios::in|std::ios::binary);
978 std::cerr << "Loading CSA from file: " << filename << std::endl;
979 file.read((char *)&bwtEndPos, sizeof(TextPosition));
981 for (ulong offset = 0; offset < n; offset ++)
982 file.read((char *)(s + offset), sizeof(char));
987 std::cerr << "Unable to open file " << filename << std::endl;
993 void CSA::SaveToFile(const char *filename, uchar *bwt)
995 std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
998 std::cerr << "Writing CSA to file: " << filename << std::endl;
999 file.write((char *)&bwtEndPos, sizeof(TextPosition));
1000 std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
1001 for (ulong offset = 0; offset < n; offset ++)
1002 file.write((char *)(bwt + offset), sizeof(char));
1007 std::cerr << "Unable to open file " << filename << std::endl;
1012 /*uchar * CSA::BWT(uchar *text)
1016 DynFMI *wt = new DynFMI((uchar *) text, n);
1018 for (ulong i=0;i<n;i++)
1020 bwtEndPos = i; // TODO: better solution ?
1028 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1030 TCodeEntry *result = new TCodeEntry[ 256 ];
1032 count_chars( text, n, result );
1033 std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1035 // First I push all the leaf nodes into the queue
1037 for ( unsigned int i = 0 ; i < 256 ; i++ )
1038 if ( result[ i ].count )
1039 q.push(node( i, result[ i ].count ) );
1041 // This loop removes the two smallest nodes from the
1042 // queue. It creates a new internal node that has
1043 // those two nodes as children. The new internal node
1044 // is then inserted into the priority queue. When there
1045 // is only one node in the priority queue, the tree
1049 while ( q.size() > 1 ) {
1050 node *child0 = new node( q.top() );
1052 node *child1 = new node( q.top() );
1054 q.push( node( child0, child1 ) );
1057 // Now I compute and return the codetable
1059 q.top().maketable(0u,0u, result);
1065 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1069 child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1070 child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1076 codetable[value].code = code;
1077 codetable[value].bits = bits;
1081 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1084 for (i = 0 ; i < 256 ; i++ )
1085 counts[ i ].count = 0;
1087 counts[(int)text[i]].count++;
1090 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1091 return x | (bit << pos);