Initial Import
[SXSI/TextCollection.git] / dynFMI.h
1 /***************************************************************************
2  *   Copyright (C) 2006 by Wolfgang Gerlach   *
3  *      *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20
21 // ------ Dynamic Sequence with Indels -----
22 // uses huffman shaped wavelet tree
23 // space: O(nH(0)) operation time: O(log n log \sigma)
24 // papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text
25 //           Indexes. CPM 2006, Chapter 3.6 
26
27
28 #include <iostream>
29 #include <cstdlib>
30 #include <fstream>
31 #include <math.h>
32 #include <bitset>
33
34 #ifndef SAMPLE
35 //#define SAMPLE 1024
36 #define SAMPLE 0 // No sampling.
37 #endif
38
39 #ifndef BUFFER
40 #define BUFFER 1048576
41 #endif
42
43
44 #include "bittree.h"
45 #include "handle.h"
46 #include "pos.h"
47
48 #if SAMPLE!=0
49 #include "ssatree.h"
50 #endif
51
52
53 #ifndef uchar
54 #define uchar unsigned char
55 #endif
56 #ifndef ulong
57 #define ulong unsigned long
58 #endif
59
60
61 const ulong sampleInterval = SAMPLE;
62
63 class WaveletNode{
64         public:
65         
66         WaveletNode *left;
67         WaveletNode *right;
68         WaveletNode *parent;
69         ulong weight; // used only while construction
70         uchar c0;      // used also while construction
71         uchar c1;
72         
73         BVTree *bittree;
74         
75         WaveletNode(uchar c)
76                 : left(0), right(0), parent(0), weight(0), c0(c), bittree(0){}
77
78         WaveletNode(WaveletNode *left, WaveletNode *right)
79                 : left(left), right(right), parent(0), bittree(0) {
80                 weight = left->weight + right->weight;
81                 left->parent = this;
82                 right->parent = this;
83         }
84         
85         ~WaveletNode(){
86                 delete bittree;
87                 bittree = 0;
88         }
89         
90         bool operator>(const WaveletNode &a) const {
91                 return (weight > a.weight);
92         }
93         
94         
95 };
96
97 namespace std
98 {
99
100 template<> struct greater<WaveletNode*>
101   {
102     bool operator()(WaveletNode const* p1, WaveletNode const* p2)
103     {
104       if(!p1)
105         return false;
106       if(!p2)
107         return true;
108       return *p1 > *p2;
109     }
110   };
111 }
112   
113   
114 class DynFMI{
115         public:
116                 
117                 // read char distribution from string
118                 // x=expected number of texts, n=length of text, del=delete text afterwards
119                 DynFMI(uchar *text, ulong x, ulong n, bool del);
120                 // read char distribution from file
121                 DynFMI(char *readfile);
122                 
123                 ~DynFMI();
124                 
125                                 
126                 
127                 //n=length of str, del=delete text afterwards
128                 ulong addText(uchar const * str, ulong n);
129                 ulong addTextFromFile(char* filename, ulong n);
130                 uchar* retrieveText(ulong text);
131                 bool deleteText(ulong key);
132                 
133                 ulong count(uchar *pattern);
134 #if SAMPLE!=0
135                 ulong* locate(uchar *pattern);
136 #endif
137                 
138                 //LF(i)-mapping: C[L[i]]+rank_L[i](L,i)
139                 ulong LFmapping(ulong i) {
140                         uchar s=(*this)[i];
141                         return (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,i);
142                         }
143                 
144
145                 ulong getSize() {return root->bittree->getPositions();}
146                 ulong getCollectionSize(){ return pos->getSize();}
147                 uchar* getBWT();
148                 std::ostream& getBWTStream(std::ostream& stream);
149
150                 uchar operator[](ulong i);
151                 void printDynFMIContent(std::ostream& stream);
152                 ulong* getKeys() {return handle->getKeys(); }   
153                 ulong getPos(ulong i)
154                 { return handle->getPos(i); }
155                 
156         private:
157                 WaveletNode *root;
158                 WaveletNode **leaves; // needed for construction and select
159                 
160                 ulong codes[256];
161                 int codelengths[256];
162                 ulong C[256+256];
163                 
164 #if SAMPLE!=0
165                 BVTree *SampledSAPositionsIndicator;
166 #endif          
167                 ulong iterate;
168                                 
169                 Handle *handle;
170                 Pos *pos;
171 #if SAMPLE!=0
172                 SampledSATree *sampledSATree;
173 #endif
174
175
176                 ulong rank(uchar c, ulong i);
177                 ulong select(uchar c, ulong i);
178         
179                 void insert(uchar c, ulong i);
180                 void append(uchar c);
181                 void append(uchar *text, ulong length);
182                 void deleteSymbol(ulong i);
183
184
185
186                 // functions
187                 ulong getNumberOfSymbolsSmallerThan(uchar c);
188                 
189                 void initEmptyDynFMI(uchar *text, ulong x, ulong n);
190                 
191                 void makeCodes(ulong code, int bits, WaveletNode *node);
192                 void deleteLeaves(WaveletNode *node);
193                 void appendBVTrees(WaveletNode *node);
194                 
195                 void deleteDynFMINodes(WaveletNode *n);
196                 
197                 //Iterator (public??)
198                 void iterateReset();
199                 bool iterateNext();
200                 uchar iterateGetSymbol();
201                 void recursiveIterateResetWaveletNode(WaveletNode *w);
202                 
203
204                 // small help functions
205                 double log2(double x){
206                         return (log10(x) / log10((double)2));
207                 }
208                 
209                 int binaryTree_parent(int i){
210                         return (int)floor((double)i/2);
211                 }
212
213                 int binaryTree_left(int i){
214                         return 2*i;
215                 }
216
217                 int binaryTree_right(int i){
218                         return 2*i + 1;
219                 }
220
221                 bool binaryTree_isLeftChild(int i){
222                         return (i%2==(int)0);
223                 }
224
225                 bool binaryTree_isRightChild(int i){
226                         return (i%2==(int)1);
227                 }
228                 
229 };
230
231
232
233