1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #include "TextCollection.h"
30 #include <cstring> // For strlen()
36 using SXSI::TextCollection;
39 ////////////////////////////////////////////////////////////////////////////
40 // Class CSA::THuffAlphabetRank
42 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
48 this->codetable = codetable;
50 bool *B = new bool[n];
53 for (i=0; i< n; i++) {
54 printf("%c:", (char)((int)s[i]-128));
55 for (r=0;r<codetable[(int)s[i]].bits;r++)
56 if (codetable[(int)s[i]].code & (1u <<r))
62 if (level > 100) return;*/
64 if (codetable[(int)s[i]].code & (1u << level)) {
69 if (sum==0 || sum==n) {
74 uchar *sfirst, *ssecond;
76 sfirst = new uchar[n-sum];
78 ssecond = new uchar[sum];
81 if (B[i]) ssecond[k++] = s[i];
82 else sfirst[j++] = s[i];
83 ulong *Binbits = new ulong[n/W+1];
85 Tools::SetField(Binbits,1,i,B[i]);
87 bitrank = new BitRank(Binbits,n,true);
89 left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
93 right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
99 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
100 // testing that the code works correctly
108 if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
110 printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
116 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
117 if (left!=NULL) delete left;
118 if (right!=NULL) delete right;
124 ////////////////////////////////////////////////////////////////////////////
128 * Constructor inits an empty dynamic FM-index.
129 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
131 CSA::CSA(unsigned samplerate)
132 : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
133 codetable(0), dynFMI(0)
135 this->samplerate = samplerate;
137 // FIXME TODO : DynFMI needs distribution of characters before hand
138 // This will create fully balanced wavelet tree for all chars [0, 255].
140 for (unsigned i = 0; i < 255; ++i)
143 dynFMI = new DynFMI(temp, 1, 255, false);
149 * Given text must include end-marker.
150 * Text identifiers are numbered starting from 0.
152 void CSA::InsertText(uchar const * text)
157 TextPosition m = std::strlen((char *)text) + 1;
158 // Store text length and starting position
159 textLength.push_back(make_pair(m, n));
161 dynFMI->addText(text, m);
164 void CSA::MakeStatic()
169 std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
173 uchar *bwt = dynFMI->getBWT();
174 /*printf("1234567890123456789\n");
175 for (TextPosition i = 0; i < n; i ++)
177 printf("%d", (int)bwt[i]);
179 printf("%c", bwt[i]);
183 /* for (TextPosition i = 1; i <= n; i ++)
185 printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
189 assert(textLength.size() == dynFMI->getCollectionSize());
196 // Calculate BWT end-marker position (of last inserted text)
198 while (bwt[i] != '\0')
201 i = C[c]+alphabetrank->rank(c, i)-1;
204 //printf("bwtEndPos = %lu\n", bwtEndPos);
208 // Make sampling tables
213 uchar* CSA::GetText(DocId k) const
215 assert(k < (DocId)textLength.size());
217 uchar* result = new uchar[textLength[k].first];
219 TextPosition j = textLength[k].first-1;
222 uchar c = alphabetrank->charAtPos(i);
227 i = C[c]+alphabetrank->rank(c,i)-1;
229 c = alphabetrank->charAtPos(i); // "next" char.
235 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
237 assert(k < (DocId)textLength.size());
238 assert(j < textLength[k].first);
241 // Start position of k'th text
242 ulong start = textLength[k].second;
244 return Substring(i + start, j-i+1);
248 * Existential queries
250 bool CSA::IsPrefix(uchar const * pattern) const
252 TextPosition m = strlen((char *)pattern);
254 return textLength.size();
256 TextPosition sp = 0, ep = 0;
257 Search(pattern, m, &sp, &ep);
259 // Check for end-marker(s) in result interval
260 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
261 it = endmarkers.lower_bound(sp);
262 if (it != endmarkers.end() && it->first <= ep)
267 bool CSA::IsSuffix(uchar const *pattern) const
269 // Here counting is as fast as isSuffix():
270 if (CountSuffix(pattern) > 0)
275 bool CSA::IsEqual(uchar const *pattern) const
277 TextPosition m = std::strlen((char *)pattern);
279 return textLength.size();
281 TextPosition sp = 0, ep = 0;
282 // Match including end-marker
283 Search(pattern, m+1, &sp, &ep);
285 // Check for end-marker(s) in result interval
286 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
287 it = endmarkers.lower_bound(sp);
288 if (it != endmarkers.end() && it->first <= ep)
293 bool CSA::IsContains(uchar const * pattern) const
295 TextPosition m = strlen((char *)pattern);
299 TextPosition sp = 0, ep = 0;
300 // Just check if pattern exists somewhere
301 ulong count = Search(pattern, m, &sp, &ep);
308 bool CSA::IsLessThan(uchar const*) const
317 unsigned CSA::CountPrefix(uchar const * pattern) const
319 TextPosition m = strlen((char *)pattern);
321 return textLength.size();
323 TextPosition sp = 0, ep = 0;
324 Search(pattern, m, &sp, &ep);
326 // Count end-markers in result interval
327 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
328 it = endmarkers.lower_bound(sp);
330 while (it != endmarkers.end() && it->first <= ep)
339 unsigned CSA::CountSuffix(uchar const * pattern) const
341 TextPosition m = strlen((char *)pattern);
343 return textLength.size();
345 TextPosition sp = 0, ep = 0;
346 // Search with end-marker
347 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
352 unsigned CSA::CountEqual(uchar const *pattern) const
354 TextPosition m = strlen((char const *)pattern);
356 return textLength.size();
358 TextPosition sp = 0, ep = 0;
359 // Match including end-marker
360 Search(pattern, m+1, &sp, &ep);
362 // Count end-markers in result interval
363 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
364 it = endmarkers.lower_bound(sp);
366 while (it != endmarkers.end() && it->first <= ep)
375 unsigned CSA::CountContains(uchar const * pattern) const
377 // Here counting is as slow as fetching the result set
378 // because we would have to filter out occ's that fall within same document.
379 TextCollection::document_result result = Contains(pattern);
380 return result.size();
383 unsigned CSA::CountLessThan(uchar const * pattern) const
390 * Document reporting queries
392 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
394 TextPosition m = strlen((char *)pattern);
396 return TextCollection::document_result();
398 TextPosition sp = 0, ep = 0;
399 Search(pattern, m, &sp, &ep);
401 TextCollection::document_result result;
403 // Report end-markers in result interval
404 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
405 it = endmarkers.lower_bound(sp);
407 while (it != endmarkers.end() && it->first <= ep)
409 // End-marker we found belongs to the "previous" doc in the collection
410 DocId docId = ((it->second).first + 1) % textLength.size();
411 result.push_back(docId);
419 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
421 TextPosition m = strlen((char *)pattern);
423 return TextCollection::document_result();
425 TextPosition sp = 0, ep = 0;
426 // Search with end-marker
427 Search(pattern, m+1, &sp, &ep);
429 TextCollection::document_result result;
431 // Check each occurrence
432 for (; sp <= ep; ++sp)
435 uchar c = alphabetrank->charAtPos(i);
436 while (c != '\0' && !sampled->IsBitSet(i))
438 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
439 c = alphabetrank->charAtPos(i);
443 // map::operator[] is not const, using find() instead:
444 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
445 // End-marker that we found belongs to the "previous" doc in collection:
446 DocId docId = (endm.first + 1) % textLength.size();
447 result.push_back(docId);
451 TextPosition textPos = suffixes[sampled->rank(i)-1];
452 result.push_back(DocIdAtTextPos(textPos));
459 TextCollection::document_result CSA::Equal(uchar const *pattern) const
461 TextPosition m = strlen((char const *)pattern);
463 return TextCollection::document_result();
465 TextPosition sp = 0, ep = 0;
466 // Match including end-marker
467 Search(pattern, m+1, &sp, &ep);
469 TextCollection::document_result result;
471 // Report end-markers in result interval
472 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
473 it = endmarkers.lower_bound(sp);
475 while (it != endmarkers.end() && it->first <= ep)
477 // End-marker that we found belongs to the "previous" doc in collection:
478 DocId docId = ((it->second).first + 1) % textLength.size();
479 result.push_back(docId);
488 TextCollection::document_result CSA::Contains(uchar const * pattern) const
490 TextPosition m = strlen((char *)pattern);
492 return TextCollection::document_result();
494 TextPosition sp = 0, ep = 0;
495 // Search all occurrences
496 Search(pattern, m, &sp, &ep);
498 // We want unique document indentifiers, using std::set to collect them
499 std::set<DocId> resultSet;
501 // Check each occurrence
502 for (; sp <= ep; ++sp)
505 uchar c = alphabetrank->charAtPos(i);
506 while (c != '\0' && !sampled->IsBitSet(i))
508 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
509 c = alphabetrank->charAtPos(i);
513 // map::operator[] is not const, using find() instead:
514 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
515 // End-marker we found belongs to the "previous" doc in collection
516 DocId docId = (endm.first + 1) % textLength.size();
517 resultSet.insert(docId);
521 TextPosition textPos = suffixes[sampled->rank(i)-1];
522 resultSet.insert(DocIdAtTextPos(textPos));
526 // Convert std::set to std::vector (TODO better way to construct result vector?)
527 TextCollection::document_result result;
528 for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
529 result.push_back(*it);
533 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
536 return document_result();
540 * Full result set queries
542 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
544 TextPosition m = strlen((char *)pattern);
546 return full_result();
548 TextPosition sp = 0, ep = 0;
549 // Search all occurrences
550 Search(pattern, m, &sp, &ep);
554 // Report each occurrence
555 for (; sp <= ep; ++sp)
558 TextPosition dist = 0;
559 uchar c = alphabetrank->charAtPos(i);
560 while (c != '\0' && !sampled->IsBitSet(i))
562 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
563 c = alphabetrank->charAtPos(i);
568 // map::operator[] is not const, using find() instead:
569 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
570 // End-marker we found belongs to the "previous" doc in the collection:
571 DocId docId = (endm.first+1)%textLength.size();
572 result.push_back(make_pair(docId, dist));
576 TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
577 // Read document identifier and its starting position in text order
578 DocId docId = DocIdAtTextPos(textPos);
579 result.push_back(make_pair(docId, textPos - textLength[docId].second));
588 * Finds document identifier for given text position
590 * Starting text position of the document is stored into second parameter.
591 * Binary searching on text starting positions.
593 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
598 DocId b = textLength.size() - 1;
601 DocId c = a + (b - a)/2;
602 if (textLength[c].second > i)
604 else if (textLength[c+1].second > i)
610 assert(a < (DocId)textLength.size());
611 assert(i > textLength[a].second);
612 assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
617 * Save index to a file handle
619 * Throws a std::runtime_error exception on i/o error.
620 * First byte that is saved represents the version number of the save file.
621 * In version 1 files, the data fields are:
622 * <uchar version> version info;
623 * <unsigned s> samplerate;
624 * <ulong n> length of the BWT;
625 * <ulong bwt> end marker position in BWT;
626 * <uchar *> BWT string of length n;
627 * <unsigned r> number of texts;
628 * <vector textLength>
629 * array of <ulong, ulong> pairs.
631 * TODO: Save the data structures instead of BWT sequence?
633 void CSA::Save(FILE *file) const
635 // Saving version 1 data:
636 uchar versionFlag = 1;
637 if (std::fwrite(&versionFlag, 1, 1, file) != 1)
638 throw std::runtime_error("CSA::Save(): file write error (version flag).");
640 if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
641 throw std::runtime_error("CSA::Save(): file write error (samplerate).");
643 if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
644 throw std::runtime_error("CSA::Save(): file write error (n).");
646 if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
647 throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
649 for (ulong offset = 0; offset < n; offset ++)
651 uchar c = alphabetrank->charAtPos(offset);
652 if (std::fwrite(&c, sizeof(uchar), 1, file) != 1)
653 throw std::runtime_error("CSA::Save(): file write error (bwt sequence).");
656 unsigned r = textLength.size();
657 if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1)
658 throw std::runtime_error("CSA::Save(): file write error (r).");
660 for (r = 0; r < textLength.size(); ++ r)
662 if (std::fwrite(&(textLength[r].first), sizeof(TextPosition), 1, file) != 1)
663 throw std::runtime_error("CSA::Save(): file write error (text length).");
664 if (std::fwrite(&(textLength[r].second), sizeof(TextPosition), 1, file) != 1)
665 throw std::runtime_error("CSA::Save(): file write error (text start).");
671 * Load index from a file handle
673 * Throws a std::runtime_error exception on i/o error.
674 * For more info, see CSA::Save().
676 void CSA::Load(FILE *file, unsigned samplerate)
679 delete dynFMI; dynFMI = 0;
680 delete alphabetrank; alphabetrank = 0;
681 delete sampled; sampled = 0;
682 delete [] suffixes; suffixes = 0;
683 delete [] positions; positions = 0;
684 delete [] codetable; codetable = 0;
689 this->samplerate = samplerate;
692 uchar versionFlag = 0;
693 if (std::fread(&versionFlag, 1, 1, file) != 1)
694 throw std::runtime_error("CSA::Load(): file read error (version flag).");
695 if (versionFlag != 1)
696 throw std::runtime_error("CSA::Load(): invalid start byte.");
698 if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
699 throw std::runtime_error("CSA::Load(): file read error (samplerate).");
700 if (this->samplerate == 0)
701 this->samplerate = samplerate;
703 if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
704 throw std::runtime_error("CSA::Load(): file read error (n).");
706 if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
707 throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
709 uchar *bwt = new uchar[n];
710 for (ulong offset = 0; offset < n; offset ++)
711 if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1)
712 throw std::runtime_error("CSA::Load(): file read error (bwt sequence).");
715 if (std::fread(&r, sizeof(unsigned), 1, file) != 1)
716 throw std::runtime_error("CSA::Load(): file read error (r).");
720 TextPosition length = 0, start = 0;
721 if (std::fread(&length, sizeof(TextPosition), 1, file) != 1)
722 throw std::runtime_error("CSA::Load(): file read error (text length).");
723 if (std::fread(&start, sizeof(TextPosition), 1, file) != 1)
724 throw std::runtime_error("CSA::Load(): file read error (text start).");
726 textLength.push_back(make_pair(length, start));
730 // Construct data structures
739 * Rest of the functions follow...
742 uchar * CSA::Substring(TextPosition i, TextPosition l) const
744 uchar *result = new uchar[l + 1];
752 TextPosition k = i + l - 1;
753 // Check for end of the string
760 TextPosition skip = samplerate - k % samplerate - 1;
762 if (k / samplerate + 1 >= n / samplerate)
769 j = positions[k/samplerate+1];
770 //cout << samplerate << ' ' << j << '\n';
773 for (dist = 0; dist < skip + l; dist++)
775 int c = alphabetrank->charAtPos(j);
778 // map::operator[] is not const, using find() instead:
779 pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
780 j = endm.first; // LF-mapping for end-marker
783 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
785 result[l + skip - dist - 1] = c;
791 /*TextPosition CSA::inverse(TextPosition i)
793 TextPosition skip = samplerate - i % samplerate;
795 if (i / samplerate + 1 >= n / samplerate)
802 j = positions[i/samplerate+1];
803 //cout << samplerate << ' ' << j << '\n';
808 int c = alphabetrank->charAtPos(j);
809 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
815 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
816 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
817 int c = (int)pattern[m-1];
819 TextPosition sp = C[c];
820 TextPosition ep = C[c+1]-1;
821 while (sp<=ep && i>=1)
823 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
824 c = (int)pattern[--i];
825 sp = C[c]+alphabetrank->rank(c,sp-1);
826 ep = C[c]+alphabetrank->rank(c,ep)-1;
836 /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
838 sp = C[(int)c]+alphabetrank->rank(c,sp-1);
839 ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
847 CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
850 while (!sampled->IsBitSet(i))
852 int c = alphabetrank->charAtPos(i);
855 // map::operator[] is not const, using find() instead:
856 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
857 return endm.second + dist; // returns text position.
859 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
863 return suffixes[sampled->rank(i)-1]+dist;
875 void CSA::makewavelet(uchar *bwt)
884 if (C[i]>0) {min = i; break;}
885 for (i=255;i>=min;--i)
886 if (C[i]>0) {max = i; break;}
889 /* for(i = 0; i < 256; i++)
890 if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
893 ulong prev=C[0], temp;
895 for (i=1;i<256;i++) {
900 this->codetable = node::makecodetable(bwt,n);
901 alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
902 //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
904 /* for (i = 0; i < n; ++i)
906 uchar c = alphabetrank->charAtPos(i);
907 TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
908 printf("LF[%lu] = %lu\n", i, j);
912 void CSA::maketables()
914 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
916 ulong *sampledpositions = new ulong[n/W+1];
917 suffixes = new ulong[sampleLength];
918 positions = new ulong[sampleLength];
921 for (i=0;i<n/W+1;i++)
922 sampledpositions[i]=0lu;
925 DocId textId = textLength.size();
930 for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
931 // i substitutes SA->GetPos(i)
934 if (x % samplerate == 0) {
935 Tools::SetField(sampledpositions,1,p,1);
936 positions[x/samplerate] = p;
939 uchar c = alphabetrank->charAtPos(p);
942 // Record the order of end-markers in BWT:
944 endmarkers[p] = make_pair(textId, (TextPosition)x);
945 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
946 p = textId; // Correct LF-mapping to the last char of the previous text.
949 p = C[c]+alphabetrank->rank(c, p)-1;
952 /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
954 printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
957 sampled = new BitRank(sampledpositions,n,true);
960 for(i=0; i<sampleLength; i++) {
961 j = sampled->rank(positions[i]);
962 if (j==0) j=sampleLength;
963 suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
967 uchar * CSA::LoadFromFile(const char *filename)
970 std::ifstream file (filename, std::ios::in|std::ios::binary);
973 std::cerr << "Loading CSA from file: " << filename << std::endl;
974 file.read((char *)&bwtEndPos, sizeof(TextPosition));
976 for (ulong offset = 0; offset < n; offset ++)
977 file.read((char *)(s + offset), sizeof(char));
982 std::cerr << "Unable to open file " << filename << std::endl;
988 void CSA::SaveToFile(const char *filename, uchar *bwt)
990 std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
993 std::cerr << "Writing CSA to file: " << filename << std::endl;
994 file.write((char *)&bwtEndPos, sizeof(TextPosition));
995 std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
996 for (ulong offset = 0; offset < n; offset ++)
997 file.write((char *)(bwt + offset), sizeof(char));
1002 std::cerr << "Unable to open file " << filename << std::endl;
1007 /*uchar * CSA::BWT(uchar *text)
1011 DynFMI *wt = new DynFMI((uchar *) text, n);
1013 for (ulong i=0;i<n;i++)
1015 bwtEndPos = i; // TODO: better solution ?
1023 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
1025 TCodeEntry *result = new TCodeEntry[ 256 ];
1027 count_chars( text, n, result );
1028 std::priority_queue< node, std::vector< node >, std::greater<node> > q;
1030 // First I push all the leaf nodes into the queue
1032 for ( unsigned int i = 0 ; i < 256 ; i++ )
1033 if ( result[ i ].count )
1034 q.push(node( i, result[ i ].count ) );
1036 // This loop removes the two smallest nodes from the
1037 // queue. It creates a new internal node that has
1038 // those two nodes as children. The new internal node
1039 // is then inserted into the priority queue. When there
1040 // is only one node in the priority queue, the tree
1044 while ( q.size() > 1 ) {
1045 node *child0 = new node( q.top() );
1047 node *child1 = new node( q.top() );
1049 q.push( node( child0, child1 ) );
1052 // Now I compute and return the codetable
1054 q.top().maketable(0u,0u, result);
1060 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
1064 child0->maketable( SetBit(code,bits,0), bits+1, codetable );
1065 child1->maketable( SetBit(code,bits,1), bits+1, codetable );
1071 codetable[value].code = code;
1072 codetable[value].bits = bits;
1076 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
1079 for (i = 0 ; i < 256 ; i++ )
1080 counts[ i ].count = 0;
1082 counts[(int)text[i]].count++;
1085 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
1086 return x | (bit << pos);