Debug swcsa
[SXSI/TextCollection.git] / dynFMI.h
1 /***************************************************************************
2  *   Copyright (C) 2006 by Wolfgang Gerlach   *
3  *      *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20
21 // ------ Dynamic Sequence with Indels -----
22 // uses huffman shaped wavelet tree
23 // space: O(nH(0)) operation time: O(log n log \sigma)
24 // papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text
25 //           Indexes. CPM 2006, Chapter 3.6 
26
27
28 #include <iostream>
29 #include <cstdlib>
30 #include <fstream>
31 #include <cmath>
32 #include <bitset>
33
34 #ifndef SAMPLE
35 //#define SAMPLE 1024
36 #define SAMPLE 0 // No sampling.
37 #endif
38
39 #ifndef BUFFER
40 #define BUFFER 1048576
41 #endif
42
43
44 #include "bittree.h"
45
46 #if SAMPLE!=0
47 #include "ssatree.h"
48 #endif
49
50
51 #ifndef uchar
52 #define uchar unsigned char
53 #endif
54 #ifndef ulong
55 #define ulong unsigned long
56 #endif
57
58
59 const ulong sampleInterval = SAMPLE;
60
61 class WaveletNode{
62         public:
63         
64         WaveletNode *left;
65         WaveletNode *right;
66         WaveletNode *parent;
67         ulong weight; // used only while construction
68         uchar c0;      // used also while construction
69         uchar c1;
70         
71         BVTree *bittree;
72         
73         WaveletNode(uchar c)
74                 : left(0), right(0), parent(0), weight(0), c0(c), bittree(0){}
75
76         WaveletNode(WaveletNode *left, WaveletNode *right)
77                 : left(left), right(right), parent(0), bittree(0) {
78                 weight = left->weight + right->weight;
79                 left->parent = this;
80                 right->parent = this;
81         }
82         
83         ~WaveletNode(){
84                 delete bittree;
85                 bittree = 0;
86         }
87         
88         bool operator>(const WaveletNode &a) const {
89                 return (weight > a.weight);
90         }
91         
92         
93 };
94
95 namespace std
96 {
97
98 template<> struct greater<WaveletNode*>
99   {
100     bool operator()(WaveletNode const* p1, WaveletNode const* p2)
101     {
102       if(!p1)
103         return false;
104       if(!p2)
105         return true;
106       return *p1 > *p2;
107     }
108   };
109 }
110   
111   
112 class DynFMI{
113         public:
114                 
115                 // read char distribution from string
116                 // x=expected number of texts, n=length of text, del=delete text afterwards
117                 DynFMI(uchar *text, ulong x, ulong n, bool del);
118                 // read char distribution from file
119                 DynFMI(char *readfile);
120                 
121                 ~DynFMI();
122                 
123                                 
124                 
125                 //n=length of str, del=delete text afterwards
126                 void addText(uchar const * str, ulong n);
127                 ulong addTextFromFile(char* filename, ulong n);
128                 uchar* retrieveText(ulong text);
129                 bool deleteText(ulong key);
130                 
131                 ulong count(uchar *pattern);
132 #if SAMPLE!=0
133                 ulong* locate(uchar *pattern);
134 #endif
135                 
136                 //LF(i)-mapping: C[L[i]]+rank_L[i](L,i)
137                 ulong LFmapping(ulong i) {
138                         uchar s=(*this)[i];
139                         return (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,i);
140                         }
141                 
142
143                 ulong getSize() {return root->bittree->getPositions();}
144                 ulong getCollectionSize(){ return numberOfTexts; }// pos->getSize();}
145                 uchar* getBWT();
146                 std::ostream& getBWTStream(std::ostream& stream);
147
148                 uchar operator[](ulong i);
149                 void printDynFMIContent(std::ostream& stream);
150                 /*ulong* getKeys() {return handle->getKeys(); } 
151                 ulong getPos(ulong i)
152                 { return handle->getPos(i); }*/
153                 
154         private:
155                 ulong numberOfTexts;
156                 WaveletNode *root;
157                 WaveletNode **leaves; // needed for construction and select
158                 
159                 ulong codes[256];
160                 int codelengths[256];
161                 ulong C[256+256];
162                 
163 #if SAMPLE!=0
164                 BVTree *SampledSAPositionsIndicator;
165 #endif          
166                 ulong iterate;
167                                 
168                 //Handle *handle;
169                 //Pos *pos;
170 #if SAMPLE!=0
171                 SampledSATree *sampledSATree;
172 #endif
173
174
175                 ulong rank(uchar c, ulong i);
176                 ulong select(uchar c, ulong i);
177         
178                 void insert(uchar c, ulong i);
179                 void append(uchar c);
180                 void append(uchar *text, ulong length);
181                 void deleteSymbol(ulong i);
182
183
184
185                 // functions
186                 ulong getNumberOfSymbolsSmallerThan(uchar c);
187                 
188                 void initEmptyDynFMI(uchar *text, ulong x, ulong n);
189                 
190                 void makeCodes(ulong code, int bits, WaveletNode *node);
191                 void deleteLeaves(WaveletNode *node);
192                 void appendBVTrees(WaveletNode *node);
193                 
194                 void deleteDynFMINodes(WaveletNode *n);
195                 
196                 //Iterator (public??)
197                 void iterateReset();
198                 bool iterateNext();
199                 uchar iterateGetSymbol();
200                 void recursiveIterateResetWaveletNode(WaveletNode *w);
201                 
202
203                 // small help functions
204                 double log2(double x){
205                         return (log10(x) / log10((double)2));
206                 }
207                 
208                 int binaryTree_parent(int i){
209                         return (int)floor((double)i/2);
210                 }
211
212                 int binaryTree_left(int i){
213                         return 2*i;
214                 }
215
216                 int binaryTree_right(int i){
217                         return 2*i + 1;
218                 }
219
220                 bool binaryTree_isLeftChild(int i){
221                         return (i%2==(int)0);
222                 }
223
224                 bool binaryTree_isRightChild(int i){
225                         return (i%2==(int)1);
226                 }
227                 
228 };
229
230
231
232