Debug swcsa
[SXSI/TextCollection.git] / dynFMI.cpp
1 /***************************************************************************
2  *   Copyright (C) 2006 by Wolfgang Gerlach   *
3  *      *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20
21
22
23 #include <iostream>
24 #include <cstdlib>
25 #include <fstream>
26 #include <stack>
27 #include <queue>
28 #include <functional>
29 #include <algorithm>
30 #include <cstring>
31
32 #include "dynFMI.h"
33
34 using std::cerr;
35 using std::cout;
36 using std::endl;
37 using std::ios;
38 using std::priority_queue;
39 using std::vector;
40
41
42
43 // -------- DynFMI --------
44
45 DynFMI::~DynFMI(){
46         deleteDynFMINodes(root);
47         root = 0;
48         delete[] leaves; //free(leaves) // ???;
49 #if SAMPLE!=0
50         delete SampledSAPositionsIndicator;
51 #endif
52 //      delete handle;
53 //      delete pos;
54 #if SAMPLE!=0
55         delete sampledSATree;
56 #endif
57 }
58
59 void DynFMI::deleteDynFMINodes(WaveletNode *n){
60         if (n->right) deleteDynFMINodes(n->right);
61         if (n->left) deleteDynFMINodes(n->left);
62         delete n;
63 }
64
65 DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){
66     numberOfTexts = 0;
67         initEmptyDynFMI(text, x, n);
68         if (del) delete [] text;
69 }
70
71 DynFMI::DynFMI(char *readfile){
72
73         std::ifstream file(readfile);
74         if (!file)
75         {
76                 cerr << "error reading file: " << readfile << endl;
77                 exit(EXIT_FAILURE);
78         }
79         
80         file.seekg(0, ios::end);
81         ulong n = file.tellg();
82         file.seekg(0, ios::beg);
83         
84         uchar *text = new uchar[n];
85         
86         char c;
87         ulong i=0;
88         
89         
90         while (file.get(c))
91         {
92                 text[i]=c;
93                 i++;
94         }
95         file.close();
96         
97         initEmptyDynFMI(text, 1, n);
98         
99         
100         delete text;
101         
102 }
103
104 void DynFMI::iterateReset(){
105         iterate = 1;
106         recursiveIterateResetWaveletNode(root);
107 }
108
109 void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){
110         w->bittree->iterateReset();
111         
112         if (w->left) recursiveIterateResetWaveletNode(w->left);
113         if (w->right) recursiveIterateResetWaveletNode(w->right);
114 }
115
116
117 bool DynFMI::iterateNext(){
118         iterate++;
119         return !(iterate > getSize());
120 }
121
122 uchar DynFMI::iterateGetSymbol(){
123
124         ulong i = iterate;
125         WaveletNode *walk = root;       
126         bool bit;
127                 
128         while (true) {
129                 
130                 bit = walk->bittree->iterateGetBit(); // TODO improve
131                 i=walk->bittree->iterateGetRank(bit);
132                 
133                 walk->bittree->iterateNext();
134                 
135                 
136                 if (bit) { //bit = 1
137                         if (walk->right == 0) return walk->c1;
138                         walk=walk->right;
139                 } else { // bit = 0
140                         if (walk->left == 0) return walk->c0;
141                         walk=walk->left;
142                 }
143                 
144                 
145         } // end of while
146         
147 }
148
149
150 uchar* DynFMI::getBWT(){
151         ulong n = root->bittree->getPositions();
152         n++;
153         uchar *text = new uchar[n];
154         
155         bool data=true;
156         // old slow version:
157         //for (ulong i=1; i <= root->bittree->getPositions(); i++)
158         //      text[i-1]=(*this)[i];
159         
160         ulong i = 0;
161         
162         iterateReset();
163         
164         while (data) {
165                 text[i] = iterateGetSymbol();   
166                 data = iterateNext();
167                 i++;
168         }
169         text[n-1]=0;
170         
171         return text;
172 }
173
174 std::ostream& DynFMI::getBWTStream(std::ostream& stream){
175         ulong n = root->bittree->getPositions();
176         n++;
177         //uchar *text = new uchar[n];
178         
179         bool data=true;
180         // old slow version:
181         //for (ulong i=1; i <= root->bittree->getPositions(); i++)
182         //      text[i-1]=(*this)[i];
183         
184         ulong i = 0;
185         
186         iterateReset();
187         
188         while (data) {
189                 //text[i] = iterateGetSymbol(); 
190                 stream << iterateGetSymbol();
191                 data = iterateNext();
192                 i++;
193         }
194         //text[n-1]=0;
195
196         return stream;
197 }
198
199 //void DynFMI::printDynFMIContent(ostream& stream){
200 //      uchar c;
201 //      for (ulong i=1; i<=getSize(); i++) 
202 //      {
203 //              c =(*this)[i];
204 //              if (c==0) c= '#';
205 //              stream << c;
206 //      }
207
208
209 void DynFMI::deleteLeaves(WaveletNode *node){
210         bool leaf = true;
211
212         if (node->left) {
213                 // internal node
214                 leaf = false;
215                 deleteLeaves(node->left);
216                 
217         }
218         if (node->right){
219                 leaf = false;
220                 deleteLeaves(node->right);
221         } 
222         
223         if (leaf) {
224                 // is a leaf, delete it!
225                 if (node->parent) {
226                         if (node==node->parent->left) node->parent->left=0;
227                                 else node->parent->right=0;
228                 }
229                 delete node;
230         }
231 }
232
233 void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){
234         #ifndef NDEBUG
235         if (node == node->left) {
236                 cout << "makeCodes: autsch" << endl;
237                 exit(0);
238                 }
239         #endif
240
241         if (node->left) {
242                 makeCodes(code | (0 << bits), bits+1, node->left);
243                 makeCodes(code | (1 << bits), bits+1, node->right);
244         } else {
245                 codes[node->c0] = code;
246                 codelengths[node->c0] = bits+1;
247         }
248 }
249
250 void DynFMI::appendBVTrees(WaveletNode *node){
251         node->bittree = new BVTree();
252
253         if (node->left) appendBVTrees(node->left);
254         if (node->right) appendBVTrees(node->right);
255 }
256
257 void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){
258         // pointers to the leaves for select
259         leaves = (WaveletNode**) new WaveletNode*[256];
260         for(int j=0; j<256; j++) leaves[j]=0;
261
262
263         ulong i;
264         for(i=0; i < n; i++) {
265                 if (text[i]==0) {
266                         cerr << "Error: unsigned char 0 is reserved." << endl;
267                         exit(EXIT_FAILURE);
268                 }
269                 
270                 if (leaves[text[i]]==0) {
271                         leaves[text[i]] = new WaveletNode(text[i]); 
272                 }
273                 leaves[text[i]]->weight++; 
274         }
275         
276         // separation symbol:
277         leaves[0] = new WaveletNode((uchar)0); 
278         leaves[0]->weight=x;
279         
280         // Veli's approach:
281         priority_queue< WaveletNode*, vector<WaveletNode*>, std::greater<WaveletNode*> > q;
282         
283         
284         for(int j=255; j>=0; j--){
285                 if (leaves[j]!=0) {
286                         q.push( (leaves[j]) );
287                 }
288                 codes[j] = 0;
289                 codelengths[j] = 0;
290         }
291         
292         // creates huffman shape:
293         while (q.size() > 1) {
294                 
295                 WaveletNode *left = q.top();
296                 q.pop();
297                 
298                 WaveletNode *right = q.top();
299                 q.pop();
300                 
301                 q.push(  new WaveletNode(left, right) );
302         }       
303         
304
305         root = q.top();
306         q.pop();
307         
308                         
309         makeCodes(0,0, root);   // writes codes and codelengths
310
311         
312         // merge leaves (one leaf represent two characters!)
313         for(int j=0; j<256; j++){
314         
315                 if (leaves[j]) {
316                 
317                         if (leaves[j]->parent->left==leaves[j]) {
318                                 leaves[j]->parent->c0=j;
319                         } else {
320                                 leaves[j]->parent->c1=j;
321                         }
322                         leaves[j]=leaves[j]->parent; // merge
323                 }
324         }
325         
326         deleteLeaves(root);
327         appendBVTrees(root);
328         
329         // array C needed for backwards search
330         for(int j=0; j<256+256; j++) C[j] = 0;
331         
332 #if SAMPLE!=0
333         this->SampledSAPositionsIndicator = new BVTree();
334         this->sampledSATree = new SampledSATree();
335 #endif
336         
337         //this->pos = new Pos(sampleInterval);
338         //this->handle = new Handle();
339         
340         //pos->handle=this->handle;
341         //handle->pos=this->pos;
342         
343 }
344
345 void DynFMI::append(uchar *text, ulong length){
346         ulong i;
347         ulong start = root->bittree->getPositions();
348         for (i = start; i<start+length; i++)
349                 insert(text[i-start], i+1);
350 }
351
352
353 void DynFMI::append(uchar c){
354         insert(c, root->bittree->getPositions()+1);
355 }
356
357 void DynFMI::insert(uchar c, ulong i){
358         #ifndef NDEBUG
359         if (codelengths[c]==0) {
360                 cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;;
361                 exit(EXIT_FAILURE);
362         }
363         #endif
364         
365         ulong level = 0;
366         ulong code = codes[c];
367
368         bool bit;
369         WaveletNode *walk = root;       
370                 
371         while (walk) {
372                 
373                 bit = ((code & (1u << level)) != 0); 
374                 
375                 walk->bittree->insertBit(bit,i); // TODO improve
376                 i=walk->bittree->rank(bit, i);
377
378                 if (bit) { //bit = 1
379                         walk=walk->right;
380                 } else { // bit = 0
381                         walk=walk->left;
382                 }
383                 
384                 level++;                
385         } // end of while
386         
387         int j = 256+c;
388         while(j>1) {
389                 C[j]++;
390                 j=binaryTree_parent(j);
391                 }
392         C[j]++; 
393         
394 }
395
396 void DynFMI::deleteSymbol(ulong i){
397         WaveletNode *walk = root;       
398         bool bit;
399         uchar c;
400         while (true) {
401
402                 // original slow version:
403                 //bit = (*walk->bittree)[i];
404                 //old_i = i;            
405                 //i=walk->bittree->rank(bit, i);        
406                 //walk->bittree->deleteBit(old_i);
407
408
409                 walk->bittree->deleteBit(i);
410                 bit=walk->bittree->getLastBitDeleted();
411                 i=walk->bittree->getLastRank();
412                 
413                 if (bit) { //bit = 1
414                         if (walk->right == 0) {
415                                 c = walk->c1;
416                                 break;
417                                 }
418                         walk=walk->right;
419                 } else { // bit = 0
420                         if (walk->left == 0) {
421                                 c = walk->c0;
422                                 break;
423                         }
424                         walk=walk->left;
425                 }
426                 
427                 
428         } // end of while
429         
430         int j = 256+c;
431         while(j>1) {
432                 C[j]--;
433                 j=binaryTree_parent(j);
434                 }
435         C[j]--; 
436         
437         
438 }
439
440
441 uchar DynFMI::operator[](ulong i){
442         WaveletNode *walk = root;       
443         bool bit;
444                 
445         while (true) {
446                 
447                 bit = (*walk->bittree)[i]; //TODO improve by reducing
448                 i=walk->bittree->rank(bit, i);
449
450                 if (bit) { //bit = 1
451                         if (walk->right == 0) return walk->c1;
452                         walk=walk->right;
453                 } else { // bit = 0
454                         if (walk->left == 0) return walk->c0;
455                         walk=walk->left;
456                 }
457                 
458                 
459         } // end of while
460         cout << endl;
461 }
462
463 ulong DynFMI::rank(uchar c, ulong i){
464         if (i==0) return 0;
465
466         ulong level = 0;
467         ulong code = codes[c];
468         
469         
470         bool bit;
471         WaveletNode *walk = root;       
472                 
473         while (true) {
474                 
475                 bit = ((code & (1u << level)) != 0);
476                 
477                 i=walk->bittree->rank(bit, i);
478                 if (bit) { //bit = 1
479                         if (walk->right == 0) return i;
480                         walk=walk->right;
481                 } else { // bit = 0
482                         if (walk->left == 0) return i;
483                         walk=walk->left;
484                 }
485         
486                 level++;                
487         } // end of while
488         
489         cerr << "error: DynFMI::rank: left while loop" << endl;
490         exit(EXIT_FAILURE);
491         return 0; //never
492 }
493
494 ulong DynFMI::select(uchar c, ulong i){
495         
496         WaveletNode *walk = leaves[c];  
497         
498         bool bit = (walk->c1==c);
499         
500         while (walk->parent) {
501                 i=walk->bittree->select(bit, i);
502                 
503                 bit = (walk == walk->parent->right);
504                 walk=walk->parent;
505         } // end of while
506         
507         i=walk->bittree->select(bit, i);
508
509         return i;
510 }
511
512
513 ulong DynFMI::count(uchar *pattern){
514
515         ulong i = strlen((char*)pattern);
516         ulong sp = 1;
517
518         ulong ep = getSize();
519         uchar s;
520         while ((sp <= ep) && (i >=1)) {
521                 s=pattern[i-1];
522                 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
523                 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
524                 
525                 i--;
526         }
527         return ep-sp +1;
528 }
529
530 #if SAMPLE!=0
531 ulong* DynFMI::locate(uchar *pattern){
532
533         ulong i = strlen((char*)pattern);
534         ulong sp = 1;
535
536         ulong ep = getSize();
537         uchar s;
538         while ((sp <= ep) && (i >=1)) {
539                 s=pattern[i-1];
540                 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
541                 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
542                 
543                 i--;
544         }
545
546         ulong numberOfMatches = ep-sp +1;
547         ulong *matches = new ulong[2*numberOfMatches + 1];
548         matches[0] = numberOfMatches;
549
550         ulong diff;
551         ulong k = 1;
552         cout << "----------------------" << endl;       
553         for (ulong j=sp; j <= ep; j++) {
554
555                 i=j;
556                 diff = 0;
557                 while (!(*SampledSAPositionsIndicator)[i]) {
558                         diff++;
559                         s=(*this)[i];
560                         i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i);
561         
562                 }
563         
564
565                 sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i));
566
567                 //cout << "found: (" << sampledSATree->handle  << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl;
568                 matches[k++] = sampledSATree->handle;
569                 matches[k++] = sampledSATree->sampledSAValue + diff;
570         }
571
572         return matches;
573 }
574 #endif
575
576 // size must include endmarker!
577 void DynFMI::addText(uchar const * str, ulong n){
578         ulong i;
579 #if SAMPLE!=0
580         bool sample = false;
581 #endif
582         i = numberOfTexts + 1;
583  
584         numberOfTexts ++;
585
586         // Adding an empty text (end-marker)
587         if (n == 1)
588         {
589             insert(str[0], i);
590             //if (del) delete [] str;
591             return;
592         }
593
594         insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1
595 #if SAMPLE!=0
596         sample = (n-1 % sampleInterval == 0);
597         SampledSAPositionsIndicator->insertBit(sample, i);      
598         if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
599 #endif
600         for (ulong t=n-2; t > 0; t--) {
601                 i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i);
602                 insert(str[t-1],i);
603 #if SAMPLE!=0
604                 sample = ((t) % sampleInterval == 0);
605                 SampledSAPositionsIndicator->insertBit(sample, i);
606
607                 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
608 #endif
609         }
610         
611
612         i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i);
613         insert(str[n-1],i);
614 #if SAMPLE!=0
615         SampledSAPositionsIndicator->insertBit(true, i); 
616         sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
617 #endif  
618 //      if (del)  delete [] str;
619         
620 //      return key;
621 }
622
623
624 /*ulong DynFMI::addTextFromFile(char* filename, ulong n){
625         ulong i;
626 #if SAMPLE!=0
627         bool sample = false;
628 #endif
629         i=pos->getSize()+1;
630
631
632         std::ifstream str(filename, ios::binary);
633         if (!str)
634         {
635                 cerr << "error reading file: " << filename << endl;
636                 exit(EXIT_FAILURE);
637         }
638
639         char c;
640         //char oldc;
641
642
643         ulong buf_len; //1048576; //10MB
644         char * buffer;
645         ulong buf_pos;
646
647         if (BUFFER==0) buf_len = n;
648                 else buf_len = BUFFER;
649
650         buffer = new char[buf_len];
651         buf_pos = (n-2) / buf_len;
652         str.seekg(buf_pos*buf_len);
653         str.read(buffer, buf_len);
654         str.clear(std::ios_base::goodbit);
655
656
657         ulong key = pos->appendText(n);
658         
659         //str.seekg(n-2);
660
661         //c=str.get();
662         c=buffer[(n-2)%buf_len];
663         insert(c,i); // insert second last character, corresponds to suffix of length 1
664 #if SAMPLE!=0
665         sample = (n-1 % sampleInterval == 0);
666         SampledSAPositionsIndicator->insertBit(sample, i);      
667         if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
668 #endif
669         if ((n-2)%buf_len == 0) {
670                 c=buffer[0];
671                 buf_pos--;
672                 str.seekg(buf_pos*buf_len);
673                 str.read(buffer, buf_len);
674         }
675         for (ulong t=n-2; t > 0; t--) {
676
677                 i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i);
678
679                 c=buffer[(t-1)%buf_len];
680                 insert(c,i);
681
682                 //check buffer
683                 if (((t-1)%buf_len == 0) && ((t-1)!=0)) {
684 #ifndef NDEBUG
685                         if (buf_pos == 0) {
686                                 cerr << "buf_pos too small" << endl;
687                                 exit(0);
688                         }
689 #endif
690                         buf_pos--;
691                         str.seekg(buf_pos*buf_len);
692                         str.read(buffer, buf_len);
693                 }
694
695 #if SAMPLE!=0
696                 sample = ((t) % sampleInterval == 0);
697                 SampledSAPositionsIndicator->insertBit(sample, i);
698
699                 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
700 #endif
701         }
702
703         i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
704         insert(c,i);
705
706 #if SAMPLE!=0
707         SampledSAPositionsIndicator->insertBit(true, i); 
708         sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
709 #endif  
710
711         str.close();
712
713         return key;
714 }
715 */
716 /*
717 uchar* DynFMI::retrieveText(ulong key){
718         cout << "key: " << key << endl;
719         uchar *text=0;
720         // TODO access two times, too much
721         ulong i=handle->getPos(key); //=bwtEnd;
722         if (i == 0 ) {
723                 return text;
724         }
725
726         ulong n=pos->getTextSize(i);
727         
728         text = new uchar[n]; // last byte 0 for cout
729         
730         uchar c;
731         
732 //TODO better   
733         for (ulong t=n-2; t < n; t--) {
734                 c = (*this)[i];
735                 text[t]=(c==0?'#':c);
736                 
737                 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
738         }
739
740         #ifndef NDEBUG          
741         for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-';
742         #endif  
743         
744         text[n-1] = 0;
745         return text;
746         }*/
747
748 /*
749 bool DynFMI::deleteText(ulong key){
750         // TODO access two times, can do better
751         ulong i=handle->getPos(key); //=bwtEnd;
752         if (i == 0) return false;
753
754         ulong n=pos->getTextSize(i);
755         ulong *text = new ulong[n]; 
756         uchar c;
757         
758         for (ulong t=n-1; t <n; t--) {
759                 c = (*this)[i];
760                 text[t]=i;
761                 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); // TODO improve by lastrank!
762         }
763
764
765         std::sort(text, text + n);
766
767
768         for (i=n-1; i<n; i--) {
769                 this->deleteSymbol(text[i]); 
770 #if SAMPLE!=0
771                 SampledSAPositionsIndicator->deleteBit(text[i]);                
772 //TODO samples ?
773 #endif
774         }
775         delete[] text;
776         handle->deleteKey(key);
777         
778                 
779         return true;
780 }
781 */
782 ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){
783         int j = 256+c;
784         ulong r=0;
785         while(j>1) {
786                 if (binaryTree_isRightChild(j)) 
787                         r += C[binaryTree_left(binaryTree_parent(j))];
788                 
789                 j=binaryTree_parent(j);
790         }
791         return r;
792 }
793
794
795
796
797 void DynFMI::printDynFMIContent(std::ostream& stream){
798         uchar c;
799         for (ulong i=1; i<=getSize(); i++) 
800         {
801                 c =(*this)[i];
802                 if (c==0) c= '#';
803                 stream << c;
804         }
805 }
806
807
808
809