Initial Import
[SXSI/TextCollection.git] / dynFMI.cpp
1 /***************************************************************************
2  *   Copyright (C) 2006 by Wolfgang Gerlach   *
3  *      *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20
21
22
23 #include <iostream>
24 #include <cstdlib>
25 #include <fstream>
26 #include <stack>
27 #include <queue>
28 #include <functional>
29 #include <algorithm>
30
31 #include "dynFMI.h"
32
33 using std::cerr;
34 using std::cout;
35 using std::endl;
36 using std::ios;
37 using std::priority_queue;
38 using std::vector;
39
40
41
42 // -------- DynFMI --------
43
44 DynFMI::~DynFMI(){
45         deleteDynFMINodes(root);
46         root = 0;
47         delete[] leaves; //free(leaves) // ???;
48 #if SAMPLE!=0
49         delete SampledSAPositionsIndicator;
50 #endif
51         delete handle;
52         delete pos;
53 #if SAMPLE!=0
54         delete sampledSATree;
55 #endif
56 }
57
58 void DynFMI::deleteDynFMINodes(WaveletNode *n){
59         if (n->right) deleteDynFMINodes(n->right);
60         if (n->left) deleteDynFMINodes(n->left);
61         delete n;
62 }
63
64 DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){
65         initEmptyDynFMI(text, x, n);
66         if (del) delete [] text;
67 }
68
69 DynFMI::DynFMI(char *readfile){
70
71         std::ifstream file(readfile);
72         if (!file)
73         {
74                 cerr << "error reading file: " << readfile << endl;
75                 exit(EXIT_FAILURE);
76         }
77         
78         file.seekg(0, ios::end);
79         ulong n = file.tellg();
80         file.seekg(0, ios::beg);
81         
82         uchar *text = new uchar[n];
83         
84         char c;
85         ulong i=0;
86         
87         
88         while (file.get(c))
89         {
90                 text[i]=c;
91                 i++;
92         }
93         file.close();
94         
95         initEmptyDynFMI(text, 1, n);
96         
97         
98         delete text;
99         
100 }
101
102 void DynFMI::iterateReset(){
103         iterate = 1;
104         recursiveIterateResetWaveletNode(root);
105 }
106
107 void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){
108         w->bittree->iterateReset();
109         
110         if (w->left) recursiveIterateResetWaveletNode(w->left);
111         if (w->right) recursiveIterateResetWaveletNode(w->right);
112 }
113
114
115 bool DynFMI::iterateNext(){
116         iterate++;
117         return !(iterate > getSize());
118 }
119
120 uchar DynFMI::iterateGetSymbol(){
121
122         ulong i = iterate;
123         WaveletNode *walk = root;       
124         bool bit;
125                 
126         while (true) {
127                 
128                 bit = walk->bittree->iterateGetBit(); // TODO improve
129                 i=walk->bittree->iterateGetRank(bit);
130                 
131                 walk->bittree->iterateNext();
132                 
133                 
134                 if (bit) { //bit = 1
135                         if (walk->right == 0) return walk->c1;
136                         walk=walk->right;
137                 } else { // bit = 0
138                         if (walk->left == 0) return walk->c0;
139                         walk=walk->left;
140                 }
141                 
142                 
143         } // end of while
144         
145 }
146
147
148 uchar* DynFMI::getBWT(){
149         ulong n = root->bittree->getPositions();
150         n++;
151         uchar *text = new uchar[n];
152         
153         bool data=true;
154         // old slow version:
155         //for (ulong i=1; i <= root->bittree->getPositions(); i++)
156         //      text[i-1]=(*this)[i];
157         
158         ulong i = 0;
159         
160         iterateReset();
161         
162         while (data) {
163                 text[i] = iterateGetSymbol();   
164                 data = iterateNext();
165                 i++;
166         }
167         text[n-1]=0;
168         
169         return text;
170 }
171
172 std::ostream& DynFMI::getBWTStream(std::ostream& stream){
173         ulong n = root->bittree->getPositions();
174         n++;
175         //uchar *text = new uchar[n];
176         
177         bool data=true;
178         // old slow version:
179         //for (ulong i=1; i <= root->bittree->getPositions(); i++)
180         //      text[i-1]=(*this)[i];
181         
182         ulong i = 0;
183         
184         iterateReset();
185         
186         while (data) {
187                 //text[i] = iterateGetSymbol(); 
188                 stream << iterateGetSymbol();
189                 data = iterateNext();
190                 i++;
191         }
192         //text[n-1]=0;
193
194         return stream;
195 }
196
197 //void DynFMI::printDynFMIContent(ostream& stream){
198 //      uchar c;
199 //      for (ulong i=1; i<=getSize(); i++) 
200 //      {
201 //              c =(*this)[i];
202 //              if (c==0) c= '#';
203 //              stream << c;
204 //      }
205
206
207 void DynFMI::deleteLeaves(WaveletNode *node){
208         bool leaf = true;
209
210         if (node->left) {
211                 // internal node
212                 leaf = false;
213                 deleteLeaves(node->left);
214                 
215         }
216         if (node->right){
217                 leaf = false;
218                 deleteLeaves(node->right);
219         } 
220         
221         if (leaf) {
222                 // is a leaf, delete it!
223                 if (node->parent) {
224                         if (node==node->parent->left) node->parent->left=0;
225                                 else node->parent->right=0;
226                 }
227                 delete node;
228         }
229 }
230
231 void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){
232         #ifndef NDEBUG
233         if (node == node->left) {
234                 cout << "makeCodes: autsch" << endl;
235                 exit(0);
236                 }
237         #endif
238
239         if (node->left) {
240                 makeCodes(code | (0 << bits), bits+1, node->left);
241                 makeCodes(code | (1 << bits), bits+1, node->right);
242         } else {
243                 codes[node->c0] = code;
244                 codelengths[node->c0] = bits+1;
245         }
246 }
247
248 void DynFMI::appendBVTrees(WaveletNode *node){
249         node->bittree = new BVTree();
250
251         if (node->left) appendBVTrees(node->left);
252         if (node->right) appendBVTrees(node->right);
253 }
254
255 void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){
256         // pointers to the leaves for select
257         leaves = (WaveletNode**) new WaveletNode*[256];
258         for(int j=0; j<256; j++) leaves[j]=0;
259
260
261         ulong i;
262         for(i=0; i < n; i++) {
263                 if (text[i]==0) {
264                         cerr << "Error: unsigned char 0 is reserved." << endl;
265                         exit(EXIT_FAILURE);
266                 }
267                 
268                 if (leaves[text[i]]==0) {
269                         leaves[text[i]] = new WaveletNode(text[i]); 
270                 }
271                 leaves[text[i]]->weight++; 
272         }
273         
274         // separation symbol:
275         leaves[0] = new WaveletNode((uchar)0); 
276         leaves[0]->weight=x;
277         
278         // Veli's approach:
279         priority_queue< WaveletNode*, vector<WaveletNode*>, std::greater<WaveletNode*> > q;
280         
281         
282         for(int j=255; j>=0; j--){
283                 if (leaves[j]!=0) {
284                         q.push( (leaves[j]) );
285                 }
286                 codes[j] = 0;
287                 codelengths[j] = 0;
288         }
289         
290         // creates huffman shape:
291         while (q.size() > 1) {
292                 
293                 WaveletNode *left = q.top();
294                 q.pop();
295                 
296                 WaveletNode *right = q.top();
297                 q.pop();
298                 
299                 q.push(  new WaveletNode(left, right) );
300         }       
301         
302
303         root = q.top();
304         q.pop();
305         
306                         
307         makeCodes(0,0, root);   // writes codes and codelengths
308
309         
310         // merge leaves (one leaf represent two characters!)
311         for(int j=0; j<256; j++){
312         
313                 if (leaves[j]) {
314                 
315                         if (leaves[j]->parent->left==leaves[j]) {
316                                 leaves[j]->parent->c0=j;
317                         } else {
318                                 leaves[j]->parent->c1=j;
319                         }
320                         leaves[j]=leaves[j]->parent; // merge
321                 }
322         }
323         
324         deleteLeaves(root);
325         appendBVTrees(root);
326         
327         // array C needed for backwards search
328         for(int j=0; j<256+256; j++) C[j] = 0;
329         
330 #if SAMPLE!=0
331         this->SampledSAPositionsIndicator = new BVTree();
332         this->sampledSATree = new SampledSATree();
333 #endif
334         
335         this->pos = new Pos(sampleInterval);
336         this->handle = new Handle();
337         
338         pos->handle=this->handle;
339         handle->pos=this->pos;
340         
341 }
342
343 void DynFMI::append(uchar *text, ulong length){
344         ulong i;
345         ulong start = root->bittree->getPositions();
346         for (i = start; i<start+length; i++)
347                 insert(text[i-start], i+1);
348 }
349
350
351 void DynFMI::append(uchar c){
352         insert(c, root->bittree->getPositions()+1);
353 }
354
355 void DynFMI::insert(uchar c, ulong i){
356         #ifndef NDEBUG
357         if (codelengths[c]==0) {
358                 cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;;
359                 exit(EXIT_FAILURE);
360         }
361         #endif
362         
363         ulong level = 0;
364         ulong code = codes[c];
365
366         bool bit;
367         WaveletNode *walk = root;       
368                 
369         while (walk) {
370                 
371                 bit = ((code & (1u << level)) != 0); 
372                 
373                 walk->bittree->insertBit(bit,i); // TODO improve
374                 i=walk->bittree->rank(bit, i);
375
376                 if (bit) { //bit = 1
377                         walk=walk->right;
378                 } else { // bit = 0
379                         walk=walk->left;
380                 }
381                 
382                 level++;                
383         } // end of while
384         
385         int j = 256+c;
386         while(j>1) {
387                 C[j]++;
388                 j=binaryTree_parent(j);
389                 }
390         C[j]++; 
391         
392 }
393
394 void DynFMI::deleteSymbol(ulong i){
395         WaveletNode *walk = root;       
396         bool bit;
397         uchar c;
398         while (true) {
399
400                 // original slow version:
401                 //bit = (*walk->bittree)[i];
402                 //old_i = i;            
403                 //i=walk->bittree->rank(bit, i);        
404                 //walk->bittree->deleteBit(old_i);
405
406
407                 walk->bittree->deleteBit(i);
408                 bit=walk->bittree->getLastBitDeleted();
409                 i=walk->bittree->getLastRank();
410                 
411                 if (bit) { //bit = 1
412                         if (walk->right == 0) {
413                                 c = walk->c1;
414                                 break;
415                                 }
416                         walk=walk->right;
417                 } else { // bit = 0
418                         if (walk->left == 0) {
419                                 c = walk->c0;
420                                 break;
421                         }
422                         walk=walk->left;
423                 }
424                 
425                 
426         } // end of while
427         
428         int j = 256+c;
429         while(j>1) {
430                 C[j]--;
431                 j=binaryTree_parent(j);
432                 }
433         C[j]--; 
434         
435         
436 }
437
438
439 uchar DynFMI::operator[](ulong i){
440         WaveletNode *walk = root;       
441         bool bit;
442                 
443         while (true) {
444                 
445                 bit = (*walk->bittree)[i]; //TODO improve by reducing
446                 i=walk->bittree->rank(bit, i);
447
448                 if (bit) { //bit = 1
449                         if (walk->right == 0) return walk->c1;
450                         walk=walk->right;
451                 } else { // bit = 0
452                         if (walk->left == 0) return walk->c0;
453                         walk=walk->left;
454                 }
455                 
456                 
457         } // end of while
458         cout << endl;
459 }
460
461 ulong DynFMI::rank(uchar c, ulong i){
462         if (i==0) return 0;
463
464         ulong level = 0;
465         ulong code = codes[c];
466         
467         
468         bool bit;
469         WaveletNode *walk = root;       
470                 
471         while (true) {
472                 
473                 bit = ((code & (1u << level)) != 0);
474                 
475                 i=walk->bittree->rank(bit, i);
476                 if (bit) { //bit = 1
477                         if (walk->right == 0) return i;
478                         walk=walk->right;
479                 } else { // bit = 0
480                         if (walk->left == 0) return i;
481                         walk=walk->left;
482                 }
483         
484                 level++;                
485         } // end of while
486         
487         cerr << "error: DynFMI::rank: left while loop" << endl;
488         exit(EXIT_FAILURE);
489         return 0; //never
490 }
491
492 ulong DynFMI::select(uchar c, ulong i){
493         
494         WaveletNode *walk = leaves[c];  
495         
496         bool bit = (walk->c1==c);
497         
498         while (walk->parent) {
499                 i=walk->bittree->select(bit, i);
500                 
501                 bit = (walk == walk->parent->right);
502                 walk=walk->parent;
503         } // end of while
504         
505         i=walk->bittree->select(bit, i);
506
507         return i;
508 }
509
510
511 ulong DynFMI::count(uchar *pattern){
512
513         ulong i = strlen((char*)pattern);
514         ulong sp = 1;
515
516         ulong ep = getSize();
517         uchar s;
518         while ((sp <= ep) && (i >=1)) {
519                 s=pattern[i-1];
520                 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
521                 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
522                 
523                 i--;
524         }
525         return ep-sp +1;
526 }
527
528 #if SAMPLE!=0
529 ulong* DynFMI::locate(uchar *pattern){
530
531         ulong i = strlen((char*)pattern);
532         ulong sp = 1;
533
534         ulong ep = getSize();
535         uchar s;
536         while ((sp <= ep) && (i >=1)) {
537                 s=pattern[i-1];
538                 sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
539                 ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
540                 
541                 i--;
542         }
543
544         ulong numberOfMatches = ep-sp +1;
545         ulong *matches = new ulong[2*numberOfMatches + 1];
546         matches[0] = numberOfMatches;
547
548         ulong diff;
549         ulong k = 1;
550         cout << "----------------------" << endl;       
551         for (ulong j=sp; j <= ep; j++) {
552
553                 i=j;
554                 diff = 0;
555                 while (!(*SampledSAPositionsIndicator)[i]) {
556                         diff++;
557                         s=(*this)[i];
558                         i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i);
559         
560                 }
561         
562
563                 sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i));
564
565                 //cout << "found: (" << sampledSATree->handle  << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl;
566                 matches[k++] = sampledSATree->handle;
567                 matches[k++] = sampledSATree->sampledSAValue + diff;
568         }
569
570         return matches;
571 }
572 #endif
573
574 // size must include endmarker!
575 ulong DynFMI::addText(uchar const * str, ulong n){
576         ulong i;
577 #if SAMPLE!=0
578         bool sample = false;
579 #endif
580         i=pos->getSize()+1;
581         
582         ulong key = pos->appendText(n);
583
584         // Adding an empty text (end-marker)
585         if (n == 1)
586         {
587             insert(str[0], i);
588             //if (del) delete [] str;
589             return key;
590         }
591
592         insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1
593 #if SAMPLE!=0
594         sample = (n-1 % sampleInterval == 0);
595         SampledSAPositionsIndicator->insertBit(sample, i);      
596         if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
597 #endif
598         for (ulong t=n-2; t > 0; t--) {
599                 i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i);
600                 insert(str[t-1],i);
601 #if SAMPLE!=0
602                 sample = ((t) % sampleInterval == 0);
603                 SampledSAPositionsIndicator->insertBit(sample, i);
604
605                 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
606 #endif
607         }
608         
609
610         i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i);
611         insert(str[n-1],i);
612 #if SAMPLE!=0
613         SampledSAPositionsIndicator->insertBit(true, i); 
614         sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
615 #endif  
616 //      if (del)  delete [] str;
617         
618         return key;
619 }
620
621
622 ulong DynFMI::addTextFromFile(char* filename, ulong n){
623         ulong i;
624 #if SAMPLE!=0
625         bool sample = false;
626 #endif
627         i=pos->getSize()+1;
628
629
630         std::ifstream str(filename, ios::binary);
631         if (!str)
632         {
633                 cerr << "error reading file: " << filename << endl;
634                 exit(EXIT_FAILURE);
635         }
636
637         char c;
638         //char oldc;
639
640
641         ulong buf_len; //1048576; //10MB
642         char * buffer;
643         ulong buf_pos;
644
645         if (BUFFER==0) buf_len = n;
646                 else buf_len = BUFFER;
647
648         buffer = new char[buf_len];
649         buf_pos = (n-2) / buf_len;
650         str.seekg(buf_pos*buf_len);
651         str.read(buffer, buf_len);
652         str.clear(std::ios_base::goodbit);
653
654
655         ulong key = pos->appendText(n);
656         
657         //str.seekg(n-2);
658
659         //c=str.get();
660         c=buffer[(n-2)%buf_len];
661         insert(c,i); // insert second last character, corresponds to suffix of length 1
662 #if SAMPLE!=0
663         sample = (n-1 % sampleInterval == 0);
664         SampledSAPositionsIndicator->insertBit(sample, i);      
665         if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
666 #endif
667         if ((n-2)%buf_len == 0) {
668                 c=buffer[0];
669                 buf_pos--;
670                 str.seekg(buf_pos*buf_len);
671                 str.read(buffer, buf_len);
672         }
673         for (ulong t=n-2; t > 0; t--) {
674
675                 i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i);
676
677                 c=buffer[(t-1)%buf_len];
678                 insert(c,i);
679
680                 //check buffer
681                 if (((t-1)%buf_len == 0) && ((t-1)!=0)) {
682 #ifndef NDEBUG
683                         if (buf_pos == 0) {
684                                 cerr << "buf_pos too small" << endl;
685                                 exit(0);
686                         }
687 #endif
688                         buf_pos--;
689                         str.seekg(buf_pos*buf_len);
690                         str.read(buffer, buf_len);
691                 }
692
693 #if SAMPLE!=0
694                 sample = ((t) % sampleInterval == 0);
695                 SampledSAPositionsIndicator->insertBit(sample, i);
696
697                 if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
698 #endif
699         }
700
701         i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
702         insert(c,i);
703
704 #if SAMPLE!=0
705         SampledSAPositionsIndicator->insertBit(true, i); 
706         sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
707 #endif  
708
709         str.close();
710
711         return key;
712 }
713
714
715 uchar* DynFMI::retrieveText(ulong key){
716         cout << "key: " << key << endl;
717         uchar *text=0;
718         // TODO access two times, too much
719         ulong i=handle->getPos(key); //=bwtEnd;
720         if (i == 0 ) {
721                 return text;
722         }
723
724         ulong n=pos->getTextSize(i);
725         
726         text = new uchar[n]; // last byte 0 for cout
727         
728         uchar c;
729         
730 //TODO better   
731         for (ulong t=n-2; t < n; t--) {
732                 c = (*this)[i];
733                 text[t]=(c==0?'#':c);
734                 
735                 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
736         }
737
738         #ifndef NDEBUG          
739         for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-';
740         #endif  
741         
742         text[n-1] = 0;
743         return text;
744 }
745
746
747 bool DynFMI::deleteText(ulong key){
748         // TODO access two times, can do better
749         ulong i=handle->getPos(key); //=bwtEnd;
750         if (i == 0) return false;
751
752         ulong n=pos->getTextSize(i);
753         ulong *text = new ulong[n]; 
754         uchar c;
755         
756         for (ulong t=n-1; t <n; t--) {
757                 c = (*this)[i];
758                 text[t]=i;
759                 i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); // TODO improve by lastrank!
760         }
761
762
763         std::sort(text, text + n);
764
765
766         for (i=n-1; i<n; i--) {
767                 this->deleteSymbol(text[i]); 
768 #if SAMPLE!=0
769                 SampledSAPositionsIndicator->deleteBit(text[i]);                
770 //TODO samples ?
771 #endif
772         }
773         delete[] text;
774         handle->deleteKey(key);
775         
776                 
777         return true;
778 }
779
780 ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){
781         int j = 256+c;
782         ulong r=0;
783         while(j>1) {
784                 if (binaryTree_isRightChild(j)) 
785                         r += C[binaryTree_left(binaryTree_parent(j))];
786                 
787                 j=binaryTree_parent(j);
788         }
789         return r;
790 }
791
792
793
794
795 void DynFMI::printDynFMIContent(std::ostream& stream){
796         uchar c;
797         for (ulong i=1; i<=getSize(); i++) 
798         {
799                 c =(*this)[i];
800                 if (c==0) c= '#';
801                 stream << c;
802         }
803 }
804
805
806
807