add #include <cstring> so that strlen is properly defined
[SXSI/TextCollection.git] / dynFMI.cpp
1 /***************************************************************************
2  *   Copyright (C) 2006 by Wolfgang Gerlach   *
3  *      *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20
21
22
23 #include <iostream>
24 #include <cstdlib>
25 #include <fstream>
26 #include <stack>
27 #include <queue>
28 #include <functional>
29 #include <algorithm>
30 #include <cstring>
31
32 #include "dynFMI.h"
33
34 using std::cerr;
35 using std::cout;
36 using std::endl;
37 using std::ios;
38 using std::priority_queue;
39 using std::vector;
40
41
42
43 // -------- DynFMI --------
44
45 DynFMI::~DynFMI(){
46         deleteDynFMINodes(root);
47         root = 0;
48         delete[] leaves; //free(leaves) // ???;
49 #if SAMPLE!=0
50         delete SampledSAPositionsIndicator;
51 #endif
52         delete handle;
53         delete pos;
54 #if SAMPLE!=0
55         delete sampledSATree;
56 #endif
57 }
58
59 void DynFMI::deleteDynFMINodes(WaveletNode *n){
60         if (n->right) deleteDynFMINodes(n->right);
61         if (n->left) deleteDynFMINodes(n->left);
62         delete n;
63 }
64
65 DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){
66         initEmptyDynFMI(text, x, n);
67         if (del) delete [] text;
68 }
69
70 DynFMI::DynFMI(char *readfile){
71
72         std::ifstream file(readfile);
73         if (!file)
74         {
75                 cerr << "error reading file: " << readfile << endl;
76                 exit(EXIT_FAILURE);
77         }
78         
79         file.seekg(0, ios::end);
80         ulong n = file.tellg();
81         file.seekg(0, ios::beg);
82         
83         uchar *text = new uchar[n];
84         
85         char c;
86         ulong i=0;
87         
88         
89         while (file.get(c))
90         {
91                 text[i]=c;
92                 i++;
93         }
94         file.close();
95         
96         initEmptyDynFMI(text, 1, n);
97         
98         
99         delete text;
100         
101 }
102
103 void DynFMI::iterateReset(){
104         iterate = 1;
105         recursiveIterateResetWaveletNode(root);
106 }
107
108 void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){
109         w->bittree->iterateReset();
110         
111         if (w->left) recursiveIterateResetWaveletNode(w->left);
112         if (w->right) recursiveIterateResetWaveletNode(w->right);
113 }
114
115
116 bool DynFMI::iterateNext(){
117         iterate++;
118         return !(iterate > getSize());
119 }
120
121 uchar DynFMI::iterateGetSymbol(){
122
123         ulong i = iterate;
124         WaveletNode *walk = root;       
125         bool bit;
126                 
127         while (true) {
128                 
129                 bit = walk->bittree->iterateGetBit(); // TODO improve
130                 i=walk->bittree->iterateGetRank(bit);
131                 
132                 walk->bittree->iterateNext();
133                 
134                 
135                 if (bit) { //bit = 1
136                         if (walk->right == 0) return walk->c1;
137                         walk=walk->right;
138                 } else { // bit = 0
139                         if (walk->left == 0) return walk->c0;
140                         walk=walk->left;
141                 }
142                 
143                 
144         } // end of while
145         
146 }
147
148
149 uchar* DynFMI::getBWT(){
150         ulong n = root->bittree->getPositions();
151         n++;
152         uchar *text = new uchar[n];
153         
154         bool data=true;
155         // old slow version:
156         //for (ulong i=1; i <= root->bittree->getPositions(); i++)
157         //      text[i-1]=(*this)[i];
158         
159         ulong i = 0;
160         
161         iterateReset();
162         
163         while (data) {
164                 text[i] = iterateGetSymbol();   
165                 data = iterateNext();
166                 i++;
167         }
168         text[n-1]=0;
169         
170         return text;
171 }
172
173 std::ostream& DynFMI::getBWTStream(std::ostream& stream){
174         ulong n = root->bittree->getPositions();
175         n++;
176         //uchar *text = new uchar[n];
177         
178         bool data=true;
179         // old slow version:
180         //for (ulong i=1; i <= root->bittree->getPositions(); i++)
181         //      text[i-1]=(*this)[i];
182         
183         ulong i = 0;
184         
185         iterateReset();
186         
187         while (data) {
188                 //text[i] = iterateGetSymbol(); 
189                 stream << iterateGetSymbol();
190                 data = iterateNext();
191                 i++;
192         }
193         //text[n-1]=0;
194
195         return stream;
196 }
197
198 //void DynFMI::printDynFMIContent(ostream& stream){
199 //      uchar c;
200 //      for (ulong i=1; i<=getSize(); i++) 
201 //      {
202 //              c =(*this)[i];
203 //              if (c==0) c= '#';
204 //              stream << c;
205 //      }
206
207
208 void DynFMI::deleteLeaves(WaveletNode *node){
209         bool leaf = true;
210
211         if (node->left) {
212                 // internal node
213                 leaf = false;
214                 deleteLeaves(node->left);
215                 
216         }
217         if (node->right){
218                 leaf = false;
219                 deleteLeaves(node->right);
220         } 
221         
222         if (leaf) {
223                 // is a leaf, delete it!
224                 if (node->parent) {
225                         if (node==node->parent->left) node->parent->left=0;
226                                 else node->parent->right=0;
227                 }
228                 delete node;
229         }
230 }
231
232 void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){
233         #ifndef NDEBUG
234         if (node == node->left) {
235                 cout << "makeCodes: autsch" << endl;
236                 exit(0);
237                 }
238         #endif
239
240         if (node->left) {
241                 makeCodes(code | (0 << bits), bits+1, node->left);
242                 makeCodes(code | (1 << bits), bits+1, node->right);
243         } else {
244                 codes[node->c0] = code;
245                 codelengths[node->c0] = bits+1;
246         }
247 }
248
249 void DynFMI::appendBVTrees(WaveletNode *node){
250         node->bittree = new BVTree();
251
252         if (node->left) appendBVTrees(node->left);
253         if (node->right) appendBVTrees(node->right);
254 }
255
256 void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){
257         // pointers to the leaves for select
258         leaves = (WaveletNode**) new WaveletNode*[256];
259         for(int j=0; j<256; j++) leaves[j]=0;
260
261
262         ulong i;
263         for(i=0; i < n; i++) {
264                 if (text[i]==0) {
265                         cerr << "Error: unsigned char 0 is reserved." << endl;
266                         exit(EXIT_FAILURE);
267                 }
268                 
269                 if (leaves[text[i]]==0) {
270                         leaves[text[i]] = new WaveletNode(text[i]); 
271                 }
272                 leaves[text[i]]->weight++; 
273         }
274         
275         // separation symbol:
276         leaves[0] = new WaveletNode((uchar)0); 
277         leaves[0]->weight=x;
278         
279         // Veli's approach:
280         priority_queue< WaveletNode*, vector<WaveletNode*>, std::greater<WaveletNode*> > q;
281         
282         
283         for(int j=255; j>=0; j--){
284                 if (leaves[j]!=0) {
285                         q.push( (leaves[j]) );
286                 }
287                 codes[j] = 0;
288                 codelengths[j] = 0;
289         }
290         
291         // creates huffman shape:
292         while (q.size() > 1) {
293                 
294                 WaveletNode *left = q.top();
295                 q.pop();
296                 
297                 WaveletNode *right = q.top();
298                 q.pop();
299                 
300                 q.push(  new WaveletNode(left, right) );
301         }       
302         
303
304         root = q.top();
305         q.pop();
306         
307                         
308         makeCodes(0,0, root);   // writes codes and codelengths
309
310         
311         // merge leaves (one leaf represent two characters!)
312         for(int j=0; j<256; j++){
313         
314                 if (leaves[j]) {
315                 
316                         if (leaves[j]->parent->left==leaves[j]) {
317                                 leaves[j]->parent->c0=j;
318                         } else {
319                                 leaves[j]->parent->c1=j;
320                         }
321                         leaves[j]=leaves[j]->parent; // merge
322                 }
323         }
324         
325         deleteLeaves(root);
326         appendBVTrees(root);
327         
328         // array C needed for backwards search
329         for(int j=0; j<256+256; j++) C[j] = 0;
330         
331 #if SAMPLE!=0
332         this->SampledSAPositionsIndicator = new BVTree();
333         this->sampledSATree = new SampledSATree();
334 #endif
335         
336         this->pos = new Pos(sampleInterval);
337         this->handle = new Handle();
338         
339         pos->handle=this->handle;
340         handle->pos=this->pos;
341         
342 }
343
344 void DynFMI::append(uchar *text, ulong length){
345         ulong i;
346         ulong start = root->bittree->getPositions();
347         for (i = start; i<start+length; i++)
348                 insert(text[i-start], i+1);
349 }
350
351
352 void DynFMI::append(uchar c){
353         insert(c, root->bittree->getPositions()+1);
354 }
355
356 void DynFMI::insert(uchar c, ulong i){
357         #ifndef NDEBUG
358         if (codelengths[c]==0) {
359                 cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;;
360                 exit(EXIT_FAILURE);
361         }
362         #endif
363         
364         ulong level = 0;
365         ulong code = codes[c];
366
367         bool bit;
368         WaveletNode *walk = root;       
369                 
370         while (walk) {
371                 
372                 bit = ((code & (1u << level)) != 0); 
373                 
374                 walk->bittree->insertBit(bit,i); // TODO improve
375                 i=walk->bittree->rank(bit, i);
376
377                 if (bit) { //bit = 1
378                         walk=walk->right;
379                 } else { // bit = 0
380                         walk=walk->left;
381                 }
382                 
383                 level++;                
384         } // end of while
385         
386         int j = 256+c;
387         while(j>1) {
388                 C[j]++;
389                 j=binaryTree_parent(j);
390                 }
391         C[j]++; 
392         
393 }
394
395 void DynFMI::deleteSymbol(ulong i){
396         WaveletNode *walk = root;       
397         bool bit;
398         uchar c;
399         while (true) {
400
401                 // original slow version:
402                 //bit = (*walk->bittree)[i];
403                 //old_i = i;            
404                 //i=walk->bittree->rank(bit, i);        
405                 //walk->bittree->deleteBit(old_i);
406
407
408                 walk->bittree->deleteBit(i);
409                 bit=walk->bittree->getLastBitDeleted();
410                 i=walk->bittree->getLastRank();
411                 
412                 if (bit) { //bit = 1
413                         if (walk->right == 0) {
414                                 c = walk->c1;
415                                 break;
416                                 }
417                         walk=walk->right;
418                 } else { // bit = 0
419                         if (walk->left == 0) {
420                                 c = walk->c0;
421                                 break;
422                         }
423                         walk=walk->left;
424                 }
425                 
426                 
427         } // end of while
428         
429         int j = 256+c;
430         while(j>1) {
431                 C[j]--;
432                 j=binaryTree_parent(j);
433                 }
434         C[j]--; 
435         
436         
437 }
438
439
440 uchar DynFMI::operator[](ulong i){
441         WaveletNode *walk = root;       
442         bool bit;
443                 
444         while (true) {
445                 
446                 bit = (*walk->bittree)[i]; //TODO improve by reducing
447                 i=walk->bittree->rank(bit, i);
448
449                 if (bit) { //bit = 1
450                         if (walk->right == 0) return walk->c1;
451                         walk=walk->right;
452                 } else { // bit = 0
453                         if (walk->left == 0) return walk->c0;
454                         walk=walk->left;
455                 }
456                 
457                 
458         } // end of while
459         cout << endl;
460 }
461
462 ulong DynFMI::rank(uchar c, ulong i){
463         if (i==0) return 0;
464
465         ulong level = 0;
466         ulong code = codes[c];
467         
468         
469         bool bit;
470         WaveletNode *walk = root;       
471                 
472         while (true) {
473                 
474                 bit = ((code & (1u << level)) != 0);
475                 
476                 i=walk->bittree->rank(bit, i);
477                 if (bit) { //bit = 1
478                         if (walk->right == 0) return i;
479                         walk=walk->right;
480                 } else { // bit = 0
481                         if (walk->left == 0) return i;
482                         walk=walk->left;
483                 }
484         
485                 level++;                
486         } // end of while
487         
488         cerr << "error: DynFMI::rank: left while loop" << endl;
489         exit(EXIT_FAILURE);
490         return 0; //never
491 }
492
493 ulong DynFMI::select(uchar c, ulong i){
494         
495         WaveletNode *walk = leaves[c];  
496         
497         bool bit = (walk->c1==c);
498         
499         while (walk->parent) {
500                 i=walk->bittree->select(bit, i);
501                 
502                 bit = (walk == walk->parent->right);
503                 walk=walk->parent;
504         } // end of while
505         
506         i=walk->bittree->select(bit, i);
507
508         return i;
509 }
510
511
512 ulong DynFMI::count(uchar *pattern){
513
514         ulong i = strlen((char*)pattern);
515         ulong sp = 1;
516
517         ulong ep = getSize();
518         uchar s;
519         while ((sp <= ep) && (i >=1)) {
520                 s=pattern[i-1];
521                 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
522                 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
523                 
524                 i--;
525         }
526         return ep-sp +1;
527 }
528
529 #if SAMPLE!=0
530 ulong* DynFMI::locate(uchar *pattern){
531
532         ulong i = strlen((char*)pattern);
533         ulong sp = 1;
534
535         ulong ep = getSize();
536         uchar s;
537         while ((sp <= ep) && (i >=1)) {
538                 s=pattern[i-1];
539                 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
540                 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
541                 
542                 i--;
543         }
544
545         ulong numberOfMatches = ep-sp +1;
546         ulong *matches = new ulong[2*numberOfMatches + 1];
547         matches[0] = numberOfMatches;
548
549         ulong diff;
550         ulong k = 1;
551         cout << "----------------------" << endl;       
552         for (ulong j=sp; j <= ep; j++) {
553
554                 i=j;
555                 diff = 0;
556                 while (!(*SampledSAPositionsIndicator)[i]) {
557                         diff++;
558                         s=(*this)[i];
559                         i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i);
560         
561                 }
562         
563
564                 sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i));
565
566                 //cout << "found: (" << sampledSATree->handle  << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl;
567                 matches[k++] = sampledSATree->handle;
568                 matches[k++] = sampledSATree->sampledSAValue + diff;
569         }
570
571         return matches;
572 }
573 #endif
574
575 // size must include endmarker!
576 ulong DynFMI::addText(uchar const * str, ulong n){
577         ulong i;
578 #if SAMPLE!=0
579         bool sample = false;
580 #endif
581         i=pos->getSize()+1;
582         
583         ulong key = pos->appendText(n);
584
585         // Adding an empty text (end-marker)
586         if (n == 1)
587         {
588             insert(str[0], i);
589             //if (del) delete [] str;
590             return key;
591         }
592
593         insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1
594 #if SAMPLE!=0
595         sample = (n-1 % sampleInterval == 0);
596         SampledSAPositionsIndicator->insertBit(sample, i);      
597         if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
598 #endif
599         for (ulong t=n-2; t > 0; t--) {
600                 i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i);
601                 insert(str[t-1],i);
602 #if SAMPLE!=0
603                 sample = ((t) % sampleInterval == 0);
604                 SampledSAPositionsIndicator->insertBit(sample, i);
605
606                 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
607 #endif
608         }
609         
610
611         i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i);
612         insert(str[n-1],i);
613 #if SAMPLE!=0
614         SampledSAPositionsIndicator->insertBit(true, i); 
615         sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
616 #endif  
617 //      if (del)  delete [] str;
618         
619         return key;
620 }
621
622
623 ulong DynFMI::addTextFromFile(char* filename, ulong n){
624         ulong i;
625 #if SAMPLE!=0
626         bool sample = false;
627 #endif
628         i=pos->getSize()+1;
629
630
631         std::ifstream str(filename, ios::binary);
632         if (!str)
633         {
634                 cerr << "error reading file: " << filename << endl;
635                 exit(EXIT_FAILURE);
636         }
637
638         char c;
639         //char oldc;
640
641
642         ulong buf_len; //1048576; //10MB
643         char * buffer;
644         ulong buf_pos;
645
646         if (BUFFER==0) buf_len = n;
647                 else buf_len = BUFFER;
648
649         buffer = new char[buf_len];
650         buf_pos = (n-2) / buf_len;
651         str.seekg(buf_pos*buf_len);
652         str.read(buffer, buf_len);
653         str.clear(std::ios_base::goodbit);
654
655
656         ulong key = pos->appendText(n);
657         
658         //str.seekg(n-2);
659
660         //c=str.get();
661         c=buffer[(n-2)%buf_len];
662         insert(c,i); // insert second last character, corresponds to suffix of length 1
663 #if SAMPLE!=0
664         sample = (n-1 % sampleInterval == 0);
665         SampledSAPositionsIndicator->insertBit(sample, i);      
666         if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
667 #endif
668         if ((n-2)%buf_len == 0) {
669                 c=buffer[0];
670                 buf_pos--;
671                 str.seekg(buf_pos*buf_len);
672                 str.read(buffer, buf_len);
673         }
674         for (ulong t=n-2; t > 0; t--) {
675
676                 i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i);
677
678                 c=buffer[(t-1)%buf_len];
679                 insert(c,i);
680
681                 //check buffer
682                 if (((t-1)%buf_len == 0) && ((t-1)!=0)) {
683 #ifndef NDEBUG
684                         if (buf_pos == 0) {
685                                 cerr << "buf_pos too small" << endl;
686                                 exit(0);
687                         }
688 #endif
689                         buf_pos--;
690                         str.seekg(buf_pos*buf_len);
691                         str.read(buffer, buf_len);
692                 }
693
694 #if SAMPLE!=0
695                 sample = ((t) % sampleInterval == 0);
696                 SampledSAPositionsIndicator->insertBit(sample, i);
697
698                 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
699 #endif
700         }
701
702         i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
703         insert(c,i);
704
705 #if SAMPLE!=0
706         SampledSAPositionsIndicator->insertBit(true, i); 
707         sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
708 #endif  
709
710         str.close();
711
712         return key;
713 }
714
715
716 uchar* DynFMI::retrieveText(ulong key){
717         cout << "key: " << key << endl;
718         uchar *text=0;
719         // TODO access two times, too much
720         ulong i=handle->getPos(key); //=bwtEnd;
721         if (i == 0 ) {
722                 return text;
723         }
724
725         ulong n=pos->getTextSize(i);
726         
727         text = new uchar[n]; // last byte 0 for cout
728         
729         uchar c;
730         
731 //TODO better   
732         for (ulong t=n-2; t < n; t--) {
733                 c = (*this)[i];
734                 text[t]=(c==0?'#':c);
735                 
736                 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
737         }
738
739         #ifndef NDEBUG          
740         for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-';
741         #endif  
742         
743         text[n-1] = 0;
744         return text;
745 }
746
747
748 bool DynFMI::deleteText(ulong key){
749         // TODO access two times, can do better
750         ulong i=handle->getPos(key); //=bwtEnd;
751         if (i == 0) return false;
752
753         ulong n=pos->getTextSize(i);
754         ulong *text = new ulong[n]; 
755         uchar c;
756         
757         for (ulong t=n-1; t <n; t--) {
758                 c = (*this)[i];
759                 text[t]=i;
760                 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); // TODO improve by lastrank!
761         }
762
763
764         std::sort(text, text + n);
765
766
767         for (i=n-1; i<n; i--) {
768                 this->deleteSymbol(text[i]); 
769 #if SAMPLE!=0
770                 SampledSAPositionsIndicator->deleteBit(text[i]);                
771 //TODO samples ?
772 #endif
773         }
774         delete[] text;
775         handle->deleteKey(key);
776         
777                 
778         return true;
779 }
780
781 ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){
782         int j = 256+c;
783         ulong r=0;
784         while(j>1) {
785                 if (binaryTree_isRightChild(j)) 
786                         r += C[binaryTree_left(binaryTree_parent(j))];
787                 
788                 j=binaryTree_parent(j);
789         }
790         return r;
791 }
792
793
794
795
796 void DynFMI::printDynFMIContent(std::ostream& stream){
797         uchar c;
798         for (ulong i=1; i<=getSize(); i++) 
799         {
800                 c =(*this)[i];
801                 if (c==0) c= '#';
802                 stream << c;
803         }
804 }
805
806
807
808