1 /***************************************************************************
2 * Copyright (C) 2006 by Wolfgang Gerlach *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ***************************************************************************/
21 // ------ Dynamic Sequence with Indels -----
22 // uses huffman shaped wavelet tree
23 // space: O(nH(0)) operation time: O(log n log \sigma)
24 // papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text
25 // Indexes. CPM 2006, Chapter 3.6
36 #define SAMPLE 0 // No sampling.
40 #define BUFFER 1048576
52 #define uchar unsigned char
55 #define ulong unsigned long
59 const ulong sampleInterval = SAMPLE;
67 ulong weight; // used only while construction
68 uchar c0; // used also while construction
74 : left(0), right(0), parent(0), weight(0), c0(c), bittree(0){}
76 WaveletNode(WaveletNode *left, WaveletNode *right)
77 : left(left), right(right), parent(0), bittree(0) {
78 weight = left->weight + right->weight;
88 bool operator>(const WaveletNode &a) const {
89 return (weight > a.weight);
98 template<> struct greater<WaveletNode*>
100 bool operator()(WaveletNode const* p1, WaveletNode const* p2)
115 // read char distribution from string
116 // x=expected number of texts, n=length of text, del=delete text afterwards
117 DynFMI(uchar *text, ulong x, ulong n, bool del);
118 // read char distribution from file
119 DynFMI(char *readfile);
125 //n=length of str, del=delete text afterwards
126 void addText(uchar const * str, ulong n);
127 ulong addTextFromFile(char* filename, ulong n);
128 uchar* retrieveText(ulong text);
129 bool deleteText(ulong key);
131 ulong count(uchar *pattern);
133 ulong* locate(uchar *pattern);
136 //LF(i)-mapping: C[L[i]]+rank_L[i](L,i)
137 ulong LFmapping(ulong i) {
139 return (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,i);
143 ulong getSize() {return root->bittree->getPositions();}
144 ulong getCollectionSize(){ return numberOfTexts; }// pos->getSize();}
146 std::ostream& getBWTStream(std::ostream& stream);
148 uchar operator[](ulong i);
149 void printDynFMIContent(std::ostream& stream);
150 /*ulong* getKeys() {return handle->getKeys(); }
151 ulong getPos(ulong i)
152 { return handle->getPos(i); }*/
157 WaveletNode **leaves; // needed for construction and select
160 int codelengths[256];
164 BVTree *SampledSAPositionsIndicator;
171 SampledSATree *sampledSATree;
175 ulong rank(uchar c, ulong i);
176 ulong select(uchar c, ulong i);
178 void insert(uchar c, ulong i);
179 void append(uchar c);
180 void append(uchar *text, ulong length);
181 void deleteSymbol(ulong i);
186 ulong getNumberOfSymbolsSmallerThan(uchar c);
188 void initEmptyDynFMI(uchar *text, ulong x, ulong n);
190 void makeCodes(ulong code, int bits, WaveletNode *node);
191 void deleteLeaves(WaveletNode *node);
192 void appendBVTrees(WaveletNode *node);
194 void deleteDynFMINodes(WaveletNode *n);
196 //Iterator (public??)
199 uchar iterateGetSymbol();
200 void recursiveIterateResetWaveletNode(WaveletNode *w);
203 // small help functions
204 double log2(double x){
205 return (log10(x) / log10((double)2));
208 int binaryTree_parent(int i){
209 return (int)floor((double)i/2);
212 int binaryTree_left(int i){
216 int binaryTree_right(int i){
220 bool binaryTree_isLeftChild(int i){
221 return (i%2==(int)0);
224 bool binaryTree_isRightChild(int i){
225 return (i%2==(int)1);