1 /***************************************************************************
2 * Copyright (C) 2006 by Wolfgang Gerlach *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ***************************************************************************/
21 // ------ Dynamic Sequence with Indels -----
22 // uses huffman shaped wavelet tree
23 // space: O(nH(0)) operation time: O(log n log \sigma)
24 // papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text
25 // Indexes. CPM 2006, Chapter 3.6
36 #define SAMPLE 0 // No sampling.
40 #define BUFFER 1048576
54 #define uchar unsigned char
57 #define ulong unsigned long
61 const ulong sampleInterval = SAMPLE;
69 ulong weight; // used only while construction
70 uchar c0; // used also while construction
76 : left(0), right(0), parent(0), weight(0), c0(c), bittree(0){}
78 WaveletNode(WaveletNode *left, WaveletNode *right)
79 : left(left), right(right), parent(0), bittree(0) {
80 weight = left->weight + right->weight;
90 bool operator>(const WaveletNode &a) const {
91 return (weight > a.weight);
100 template<> struct greater<WaveletNode*>
102 bool operator()(WaveletNode const* p1, WaveletNode const* p2)
117 // read char distribution from string
118 // x=expected number of texts, n=length of text, del=delete text afterwards
119 DynFMI(uchar *text, ulong x, ulong n, bool del);
120 // read char distribution from file
121 DynFMI(char *readfile);
127 //n=length of str, del=delete text afterwards
128 ulong addText(uchar const * str, ulong n);
129 ulong addTextFromFile(char* filename, ulong n);
130 uchar* retrieveText(ulong text);
131 bool deleteText(ulong key);
133 ulong count(uchar *pattern);
135 ulong* locate(uchar *pattern);
138 //LF(i)-mapping: C[L[i]]+rank_L[i](L,i)
139 ulong LFmapping(ulong i) {
141 return (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,i);
145 ulong getSize() {return root->bittree->getPositions();}
146 ulong getCollectionSize(){ return pos->getSize();}
148 std::ostream& getBWTStream(std::ostream& stream);
150 uchar operator[](ulong i);
151 void printDynFMIContent(std::ostream& stream);
152 ulong* getKeys() {return handle->getKeys(); }
153 ulong getPos(ulong i)
154 { return handle->getPos(i); }
158 WaveletNode **leaves; // needed for construction and select
161 int codelengths[256];
165 BVTree *SampledSAPositionsIndicator;
172 SampledSATree *sampledSATree;
176 ulong rank(uchar c, ulong i);
177 ulong select(uchar c, ulong i);
179 void insert(uchar c, ulong i);
180 void append(uchar c);
181 void append(uchar *text, ulong length);
182 void deleteSymbol(ulong i);
187 ulong getNumberOfSymbolsSmallerThan(uchar c);
189 void initEmptyDynFMI(uchar *text, ulong x, ulong n);
191 void makeCodes(ulong code, int bits, WaveletNode *node);
192 void deleteLeaves(WaveletNode *node);
193 void appendBVTrees(WaveletNode *node);
195 void deleteDynFMINodes(WaveletNode *n);
197 //Iterator (public??)
200 uchar iterateGetSymbol();
201 void recursiveIterateResetWaveletNode(WaveletNode *w);
204 // small help functions
205 double log2(double x){
206 return (log10(x) / log10((double)2));
209 int binaryTree_parent(int i){
210 return (int)floor((double)i/2);
213 int binaryTree_left(int i){
217 int binaryTree_right(int i){
221 bool binaryTree_isLeftChild(int i){
222 return (i%2==(int)0);
225 bool binaryTree_isRightChild(int i){
226 return (i%2==(int)1);