1 /******************************************************************************
2 * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU Lesser General Public License as published *
7 * by the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU Lesser General Public License for more details. *
15 * You should have received a copy of the GNU Lesser General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 *****************************************************************************/
21 #include "TextCollection.h"
29 #include <cstring> // For strlen()
35 using SXSI::TextCollection;
38 ////////////////////////////////////////////////////////////////////////////
39 // Class CSA::THuffAlphabetRank
41 CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
47 this->codetable = codetable;
49 bool *B = new bool[n];
52 for (i=0; i< n; i++) {
53 printf("%c:", (char)((int)s[i]-128));
54 for (r=0;r<codetable[(int)s[i]].bits;r++)
55 if (codetable[(int)s[i]].code & (1u <<r))
61 if (level > 100) return;*/
63 if (codetable[(int)s[i]].code & (1u << level)) {
68 if (sum==0 || sum==n) {
73 uchar *sfirst, *ssecond;
75 sfirst = new uchar[n-sum];
77 ssecond = new uchar[sum];
80 if (B[i]) ssecond[k++] = s[i];
81 else sfirst[j++] = s[i];
82 ulong *Binbits = new ulong[n/W+1];
84 Tools::SetField(Binbits,1,i,B[i]);
86 bitrank = new BitRank(Binbits,n,true);
88 left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
92 right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
98 bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
99 // testing that the code works correctly
107 if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
109 printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
115 CSA::THuffAlphabetRank::~THuffAlphabetRank() {
116 if (left!=NULL) delete left;
117 if (right!=NULL) delete right;
123 ////////////////////////////////////////////////////////////////////////////
127 * Constructor inits an empty dynamic FM-index.
128 * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
130 CSA::CSA(unsigned samplerate)
131 : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
132 codetable(0), dynFMI(0)
134 this->samplerate = samplerate;
136 // FIXME TODO : DynFMI needs distribution of characters before hand
137 // This will create fully balanced wavelet tree for all chars [0, 255].
139 for (unsigned i = 0; i < 255; ++i)
142 dynFMI = new DynFMI(temp, 1, 255, false);
148 * Given text must include end-marker.
149 * Text identifiers are numbered starting from 0.
151 void CSA::InsertText(uchar const * text)
156 TextPosition m = std::strlen((char *)text) + 1;
157 // Store text length and starting position
158 textLength.push_back(make_pair(m, n));
160 dynFMI->addText(text, m);
163 void CSA::MakeStatic()
168 std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
172 uchar *bwt = dynFMI->getBWT();
173 /*printf("1234567890123456789\n");
174 for (TextPosition i = 0; i < n; i ++)
176 printf("%d", (int)bwt[i]);
178 printf("%c", bwt[i]);
182 /* for (TextPosition i = 1; i <= n; i ++)
184 printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
188 assert(textLength.size() == dynFMI->getCollectionSize());
200 if (C[i]>0) {min = i; break;}
201 for (i=255;i>=min;--i)
202 if (C[i]>0) {max = i; break;}
205 /* for(i = 0; i < 256; i++)
206 if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
209 ulong prev=C[0], temp;
211 for (i=1;i<256;i++) {
216 this->codetable = node::makecodetable(bwt,n);
217 alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
218 //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
220 /* for (i = 0; i < n; ++i)
222 uchar c = alphabetrank->charAtPos(i);
223 TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
224 printf("LF[%lu] = %lu\n", i, j);
227 // Calculate BWT end-marker position (of last inserted text)
229 while (bwt[i] != '\0')
231 uchar c = alphabetrank->charAtPos(i);
232 i = C[c]+alphabetrank->rank(c, i)-1;
235 //printf("bwtEndPos = %lu\n", bwtEndPos);
239 // Make sampling tables
241 // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small
242 // |class CSA| +256*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+...
243 //printf("FMindex takes %d B\n",
244 // 6*W/8+256*3*W/8+256*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8);
248 uchar* CSA::GetText(DocId k) const
250 assert(k < (DocId)textLength.size());
252 uchar* result = new uchar[textLength[k].first];
254 TextPosition j = textLength[k].first-1;
257 uchar c = alphabetrank->charAtPos(i);
262 i = C[c]+alphabetrank->rank(c,i)-1;
264 c = alphabetrank->charAtPos(i); // "next" char.
270 uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
272 assert(k < (DocId)textLength.size());
273 assert(j < textLength[k].first);
276 // Start position of k'th text
277 ulong start = textLength[k].second;
279 return Substring(i + start, j-i+1);
283 * Existential queries
285 bool CSA::IsPrefix(uchar const * pattern) const
287 TextPosition m = strlen((char *)pattern);
289 return textLength.size();
291 TextPosition sp = 0, ep = 0;
292 Search(pattern, m, &sp, &ep);
294 // Check for end-marker(s) in result interval
295 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
296 it = endmarkers.lower_bound(sp);
297 if (it != endmarkers.end() && it->first <= ep)
302 bool CSA::IsSuffix(uchar const *pattern) const
304 // Here counting is as fast as isSuffix():
305 if (CountSuffix(pattern) > 0)
310 bool CSA::IsEqual(uchar const *pattern) const
312 TextPosition m = std::strlen((char *)pattern);
314 return textLength.size();
316 TextPosition sp = 0, ep = 0;
317 // Match including end-marker
318 Search(pattern, m+1, &sp, &ep);
320 // Check for end-marker(s) in result interval
321 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
322 it = endmarkers.lower_bound(sp);
323 if (it != endmarkers.end() && it->first <= ep)
328 bool CSA::IsContains(uchar const * pattern) const
330 TextPosition m = strlen((char *)pattern);
334 TextPosition sp = 0, ep = 0;
335 // Just check if pattern exists somewhere
336 ulong count = Search(pattern, m, &sp, &ep);
343 bool CSA::IsLessThan(uchar const*) const
352 unsigned CSA::CountPrefix(uchar const * pattern) const
354 TextPosition m = strlen((char *)pattern);
356 return textLength.size();
358 TextPosition sp = 0, ep = 0;
359 Search(pattern, m, &sp, &ep);
361 // Count end-markers in result interval
362 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
363 it = endmarkers.lower_bound(sp);
365 while (it != endmarkers.end() && it->first <= ep)
374 unsigned CSA::CountSuffix(uchar const * pattern) const
376 TextPosition m = strlen((char *)pattern);
378 return textLength.size();
380 TextPosition sp = 0, ep = 0;
381 // Search with end-marker
382 unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
387 unsigned CSA::CountEqual(uchar const *pattern) const
389 TextPosition m = strlen((char const *)pattern);
391 return textLength.size();
393 TextPosition sp = 0, ep = 0;
394 // Match including end-marker
395 Search(pattern, m+1, &sp, &ep);
397 // Count end-markers in result interval
398 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
399 it = endmarkers.lower_bound(sp);
401 while (it != endmarkers.end() && it->first <= ep)
410 unsigned CSA::CountContains(uchar const * pattern) const
412 // Here counting is as slow as fetching the result set
413 // because we would have to filter out occ's that fall within same document.
414 TextCollection::document_result result = Contains(pattern);
415 return result.size();
418 unsigned CSA::CountLessThan(uchar const * pattern) const
425 * Document reporting queries
427 TextCollection::document_result CSA::Prefix(uchar const * pattern) const
429 TextPosition m = strlen((char *)pattern);
431 return TextCollection::document_result();
433 TextPosition sp = 0, ep = 0;
434 Search(pattern, m, &sp, &ep);
436 TextCollection::document_result result;
438 // Report end-markers in result interval
439 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
440 it = endmarkers.lower_bound(sp);
442 while (it != endmarkers.end() && it->first <= ep)
444 // End-marker we found belongs to the "previous" doc in the collection
445 DocId docId = ((it->second).first + 1) % textLength.size();
446 result.push_back(docId);
454 TextCollection::document_result CSA::Suffix(uchar const * pattern) const
456 TextPosition m = strlen((char *)pattern);
458 return TextCollection::document_result();
460 TextPosition sp = 0, ep = 0;
461 // Search with end-marker
462 Search(pattern, m+1, &sp, &ep);
464 TextCollection::document_result result;
466 // Check each occurrence
467 for (; sp <= ep; ++sp)
470 uchar c = alphabetrank->charAtPos(i);
471 while (c != '\0' && !sampled->IsBitSet(i))
473 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
474 c = alphabetrank->charAtPos(i);
478 // map::operator[] is not const, using find() instead:
479 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
480 // End-marker that we found belongs to the "previous" doc in collection:
481 DocId docId = (endm.first + 1) % textLength.size();
482 result.push_back(docId);
486 TextPosition textPos = suffixes[sampled->rank(i)-1];
487 result.push_back(DocIdAtTextPos(textPos));
494 TextCollection::document_result CSA::Equal(uchar const *pattern) const
496 TextPosition m = strlen((char const *)pattern);
498 return TextCollection::document_result();
500 TextPosition sp = 0, ep = 0;
501 // Match including end-marker
502 Search(pattern, m+1, &sp, &ep);
504 TextCollection::document_result result;
506 // Report end-markers in result interval
507 map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
508 it = endmarkers.lower_bound(sp);
510 while (it != endmarkers.end() && it->first <= ep)
512 // End-marker that we found belongs to the "previous" doc in collection:
513 DocId docId = ((it->second).first + 1) % textLength.size();
514 result.push_back(docId);
523 TextCollection::document_result CSA::Contains(uchar const * pattern) const
525 TextPosition m = strlen((char *)pattern);
527 return TextCollection::document_result();
529 TextPosition sp = 0, ep = 0;
530 // Search all occurrences
531 Search(pattern, m, &sp, &ep);
533 // We want unique document indentifiers, using std::set to collect them
534 std::set<DocId> resultSet;
536 // Check each occurrence
537 for (; sp <= ep; ++sp)
540 uchar c = alphabetrank->charAtPos(i);
541 while (c != '\0' && !sampled->IsBitSet(i))
543 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
544 c = alphabetrank->charAtPos(i);
548 // map::operator[] is not const, using find() instead:
549 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
550 // End-marker we found belongs to the "previous" doc in collection
551 DocId docId = (endm.first + 1) % textLength.size();
552 resultSet.insert(docId);
556 TextPosition textPos = suffixes[sampled->rank(i)-1];
557 resultSet.insert(DocIdAtTextPos(textPos));
561 // Convert std::set to std::vector (TODO better way to construct result vector?)
562 TextCollection::document_result result;
563 for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
564 result.push_back(*it);
568 TextCollection::document_result CSA::LessThan(uchar const * pattern) const
571 return document_result();
575 * Full result set queries
577 TextCollection::full_result CSA::FullContains(uchar const * pattern) const
579 TextPosition m = strlen((char *)pattern);
581 return full_result();
583 TextPosition sp = 0, ep = 0;
584 // Search all occurrences
585 Search(pattern, m, &sp, &ep);
589 // Report each occurrence
590 for (; sp <= ep; ++sp)
593 TextPosition dist = 0;
594 uchar c = alphabetrank->charAtPos(i);
595 while (c != '\0' && !sampled->IsBitSet(i))
597 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
598 c = alphabetrank->charAtPos(i);
603 // map::operator[] is not const, using find() instead:
604 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
605 // End-marker we found belongs to the "previous" doc in the collection:
606 DocId docId = (endm.first+1)%textLength.size();
607 result.push_back(make_pair(docId, dist));
611 TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
612 // Read document identifier and its starting position in text order
613 DocId docId = DocIdAtTextPos(textPos);
614 result.push_back(make_pair(docId, textPos - textLength[docId].second));
623 * Finds document identifier for given text position
625 * Starting text position of the document is stored into second parameter.
626 * Binary searching on text starting positions.
628 TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
633 DocId b = textLength.size() - 1;
636 DocId c = a + (b - a)/2;
637 if (textLength[c].second > i)
639 else if (textLength[c+1].second > i)
645 assert(a < (DocId)textLength.size());
646 assert(i > textLength[a].second);
647 assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
651 void CSA::Load(FILE *filename, unsigned samplerate)
657 void CSA::Save(FILE *filename)
664 * Rest of the functions follow...
667 uchar * CSA::Substring(TextPosition i, TextPosition l) const
669 uchar *result = new uchar[l + 1];
677 TextPosition k = i + l - 1;
678 // Check for end of the string
685 TextPosition skip = samplerate - k % samplerate - 1;
687 if (k / samplerate + 1 >= n / samplerate)
694 j = positions[k/samplerate+1];
695 //cout << samplerate << ' ' << j << '\n';
698 for (dist = 0; dist < skip + l; dist++)
700 int c = alphabetrank->charAtPos(j);
703 // map::operator[] is not const, using find() instead:
704 pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
705 j = endm.first; // LF-mapping for end-marker
708 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
710 result[l + skip - dist - 1] = c;
716 /*TextPosition CSA::inverse(TextPosition i)
718 TextPosition skip = samplerate - i % samplerate;
720 if (i / samplerate + 1 >= n / samplerate)
727 j = positions[i/samplerate+1];
728 //cout << samplerate << ' ' << j << '\n';
733 int c = alphabetrank->charAtPos(j);
734 j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
740 ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
741 // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
742 int c = (int)pattern[m-1];
744 TextPosition sp = C[c];
745 TextPosition ep = C[c+1]-1;
746 while (sp<=ep && i>=1)
748 // printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
749 c = (int)pattern[--i];
750 sp = C[c]+alphabetrank->rank(c,sp-1);
751 ep = C[c]+alphabetrank->rank(c,ep)-1;
761 /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
763 sp = C[(int)c]+alphabetrank->rank(c,sp-1);
764 ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
772 CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
775 while (!sampled->IsBitSet(i))
777 int c = alphabetrank->charAtPos(i);
780 // map::operator[] is not const, using find() instead:
781 pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
782 return endm.second + dist; // returns text position.
784 i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
788 return suffixes[sampled->rank(i)-1]+dist;
800 void CSA::maketables()
802 ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
804 ulong *sampledpositions = new ulong[n/W+1];
805 suffixes = new ulong[sampleLength];
806 positions = new ulong[sampleLength];
809 for (i=0;i<n/W+1;i++)
810 sampledpositions[i]=0lu;
813 DocId textId = textLength.size();
818 for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
819 // i substitutes SA->GetPos(i)
822 if (x % samplerate == 0) {
823 Tools::SetField(sampledpositions,1,p,1);
824 positions[x/samplerate] = p;
827 uchar c = alphabetrank->charAtPos(p);
830 // Record the order of end-markers in BWT:
832 endmarkers[p] = make_pair(textId, (TextPosition)x);
833 // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
834 p = textId; // Correct LF-mapping to the last char of the previous text.
837 p = C[c]+alphabetrank->rank(c, p)-1;
840 /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
842 printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
845 sampled = new BitRank(sampledpositions,n,true);
848 for(i=0; i<sampleLength; i++) {
849 j = sampled->rank(positions[i]);
850 if (j==0) j=sampleLength;
851 suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
855 uchar * CSA::LoadFromFile(const char *filename)
858 std::ifstream file (filename, std::ios::in|std::ios::binary);
861 std::cerr << "Loading CSA from file: " << filename << std::endl;
862 file.read((char *)&bwtEndPos, sizeof(TextPosition));
864 for (ulong offset = 0; offset < n; offset ++)
865 file.read((char *)(s + offset), sizeof(char));
870 std::cerr << "Unable to open file " << filename << std::endl;
876 void CSA::SaveToFile(const char *filename, uchar *bwt)
878 std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
881 std::cerr << "Writing CSA to file: " << filename << std::endl;
882 file.write((char *)&bwtEndPos, sizeof(TextPosition));
883 std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
884 for (ulong offset = 0; offset < n; offset ++)
885 file.write((char *)(bwt + offset), sizeof(char));
890 std::cerr << "Unable to open file " << filename << std::endl;
895 /*uchar * CSA::BWT(uchar *text)
899 DynFMI *wt = new DynFMI((uchar *) text, n);
901 for (ulong i=0;i<n;i++)
903 bwtEndPos = i; // TODO: better solution ?
911 CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
913 TCodeEntry *result = new TCodeEntry[ 256 ];
915 count_chars( text, n, result );
916 std::priority_queue< node, std::vector< node >, std::greater<node> > q;
918 // First I push all the leaf nodes into the queue
920 for ( unsigned int i = 0 ; i < 256 ; i++ )
921 if ( result[ i ].count )
922 q.push(node( i, result[ i ].count ) );
924 // This loop removes the two smallest nodes from the
925 // queue. It creates a new internal node that has
926 // those two nodes as children. The new internal node
927 // is then inserted into the priority queue. When there
928 // is only one node in the priority queue, the tree
932 while ( q.size() > 1 ) {
933 node *child0 = new node( q.top() );
935 node *child1 = new node( q.top() );
937 q.push( node( child0, child1 ) );
940 // Now I compute and return the codetable
942 q.top().maketable(0u,0u, result);
948 void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
952 child0->maketable( SetBit(code,bits,0), bits+1, codetable );
953 child1->maketable( SetBit(code,bits,1), bits+1, codetable );
959 codetable[value].code = code;
960 codetable[value].bits = bits;
964 void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
967 for (i = 0 ; i < 256 ; i++ )
968 counts[ i ].count = 0;
970 counts[(int)text[i]].count++;
973 unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
974 return x | (bit << pos);