1 /***************************************************************************
2 * Copyright (C) 2007 by Veli Mäkinen *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
19 ***************************************************************************/
21 #include "RLWaveletTree.h"
24 void RLWaveletTree::remapAlphabet(uchar *text, ulong n)
26 ulong *chars = new ulong[256/W+1];
28 for (i = 0; i < 256/W+1; ++i)
31 for (i = 0; i < n; ++i)
32 Tools::SetField(chars, 1, text[i], 1);
33 charMap = new BitRank(chars, 256, true);
36 for (i = 0; i < n; ++i)
37 text[i] = charMap->rank(text[i])-1;
40 RLWaveletTree::RLWaveletTree(uchar *D, ulong n) {
42 // std::cout << "Number of chars: " << charMap->rank(255) << ", maxlevel = " << Tools::CeilLog2(charMap->rank(255)) << std::endl;
45 this->maxlevel = Tools::CeilLog2(charMap->rank(255));
48 ulong** bucket = new ulong*[maxlevel+1];
49 ulong *wtree = new ulong[n*maxlevel/(8*sizeof(ulong))+1];
50 for (i = 0; i < n*maxlevel/(8*sizeof(ulong))+1; i ++)
53 // allocate memory for storing the locations of boudaries
54 for (k=0; k<=maxlevel; k++) {
55 bucket[k] = (ulong *)new ulong[(ulong)1 << k];
56 for (i=0; i<((ulong)1 << k); i++)
60 // compute the sizes of buckets
63 for (k=0; k<=maxlevel; k++)
64 bucket[k][value >> (maxlevel-k)]++;
67 // cumulate the bucket sizes into boundary locations
68 for (k=0; k<=maxlevel; k++) {
69 for (i=1; i<((ulong)1 << k); i++)
70 bucket[k][i]+=bucket[k][i-1];
71 for (i=((ulong)1 << k)-1; i>=1; i--)
72 bucket[k][i]=bucket[k][i-1];
76 //store the leaf information
77 leaves = new ulong[((ulong)1 << maxlevel)+1];
78 for (k=0; k<(ulong)1<<maxlevel; k++)
79 leaves[k] = bucket[maxlevel][k];
82 // compute wtree using the information about boundaries
85 for (k=0; k<maxlevel; k++)
86 Tools::SetField(wtree,1,k*n+(bucket[k][value >> (maxlevel-k)])++,(value>>(maxlevel-k-1)) & (ulong)1);
89 // deallocate boundaries
90 for (k=0; k<=maxlevel; k++)
94 bitrank = new GapEncode(wtree, n*maxlevel, true);
100 RLWaveletTree::~RLWaveletTree() {
106 ulong RLWaveletTree::rank(ulong c, ulong i) {
107 // if (i >= n) FIXME Replace with assert?
108 // std::cout << "RLWT::rank parameter i out of range" << std::endl;
111 ulong left = 0, right = (ulong)1 << maxlevel;
113 c = charMap->rank(c)-1;
115 // first round separately to avoid extra "if"
116 if ((c >> (maxlevel-1)) & (ulong)1) { // go to the right
117 j = bitrank->rank(j)-1;
120 else { // go to the left
121 j -= bitrank->rank(j);
124 // then the next without the extra if
125 for (k=1;k<maxlevel;k++) {
126 before = bitrank->rank(n*k+leaves[left]-1);
127 if ((c >> (maxlevel-k-1)) & (ulong)1) { // go to the right
128 j = bitrank->rank(n*k+leaves[left]+j)-before-1;
129 left = (right+left)>>1;
131 else { // go to the left
132 j -= bitrank->rank(n*k+leaves[left]+j)-before;
133 right = (right+left)>>1;
135 // can never happen: if (left > right) return 0;
140 ulong RLWaveletTree::select(ulong c, ulong j) {
141 c = charMap->rank(c)-1;
142 ulong left=c, right=c+1;
143 ulong k, i=j-1, before;
144 for (k=maxlevel-1; k>0;--k) {
145 if ((c >> (maxlevel-k-1)) & (ulong)1) { // right side of parent
147 before = bitrank->rank(n*k+leaves[left]-1);
148 i = bitrank->select(before+i+1)-n*k-leaves[left];
150 else { // left side of parent
152 before = n*k+leaves[left]-bitrank->rank(n*k+leaves[left]-1);
153 i = bitrank->select0(before+i+1)-n*k-leaves[left];
156 // last level separately to avoid one extra if
157 if ((c >> (maxlevel-1)) & (ulong)1) // right side of parent
158 return bitrank->select(i+1);
159 else // left side of parent
160 return bitrank->select0(i+1);
163 bool RLWaveletTree::IsCharAtPos(ulong c, ulong i) {
164 return charAtPos(i)==c;
168 ulong RLWaveletTree::charAtPos(ulong i)
170 // if (i >= n) // FIXME Replace with assert?
171 // std::cout << "RLWT::charAtPos parameter i out of range" << std::endl;
174 ulong left = 0, right = (ulong)1 << maxlevel;
178 // first round separately to avoid extra "if"
179 if (bitrank->IsBitSet(j, &rank_tmp)) { // go to the right
180 c = c | ((ulong)1 << (maxlevel-1));
181 j = rank_tmp-1; //bitrank->rank(j)-1;
184 else { // go to the left
186 j -= rank_tmp; // bitrank->rank(j);
189 // then the next without the extra if
190 for (k=1;k<maxlevel;k++) {
191 before = bitrank->rank(n*k+leaves[left]-1);
192 if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
193 c = c | ((ulong)1 << (maxlevel-k-1));
194 j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
195 left = (right+left)>>1;
197 else { // go to the left
199 j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
200 right = (right+left)>>1;
202 // can never happen: if (left > right) return 0;
204 return charMap->select(c+1);
207 /* returns also the rank-1 of char at given position i */
208 ulong RLWaveletTree::charAtPos(ulong i, ulong *rank)
210 // if (i >= n) // FIXME Replace with assert?
211 // std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
214 ulong left = 0, right = (ulong)1 << maxlevel;
218 // first round separately to avoid extra "if"
219 if (bitrank->IsBitSet(j, &rank_tmp)) { // FIXME j or leaves[left]+j ?
220 c = c | ((ulong)1 << (maxlevel-1));
221 j = rank_tmp-1; //bitrank->rank(j)-1;
224 else { // go to the left
226 j -= rank_tmp; //bitrank->rank(j);
229 // then the next without the extra if
230 for (k=1;k<maxlevel;k++) {
231 before = bitrank->rank(n*k+leaves[left]-1);
232 if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
233 c = c | ((ulong)1 << (maxlevel-k-1));
234 j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
235 left = (right+left)>>1;
237 else { // go to the left
239 j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
240 right = (right+left)>>1;
242 // can never happen: if (left > right) return 0;
245 return charMap->select(c+1);
248 /* returns also the rank-1 of char at given position i
249 * and trailing char-run length */
250 ulong RLWaveletTree::charAtPosRun(ulong i, ulong *rank)
252 // Check for buffered data from previous call
253 if (i > prevPos && i <= prevPos + prevRunLen)
255 *rank = prevRank + (i-prevPos);
260 // if (i >= n) // FIXME Replace with assert?
261 // std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
264 ulong left = 0, right = (ulong)1 << maxlevel;
269 // first round separately to avoid extra "if"
270 if (bitrank->IsBitSet(j, &rank_tmp, &prevRunLen)) {
271 if (j+prevRunLen > n)
273 c = c | ((ulong)1 << (maxlevel-1));
274 j = rank_tmp-1; //bitrank->rank(j)-1;
277 else { // go to the left
279 if (j+prevRunLen > n)
281 j -= rank_tmp; //bitrank->rank(j);
285 // then the next without the extra if
286 for (k=1;k<maxlevel;k++) {
287 before = bitrank->rank(n*k+leaves[left]-1);
288 if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp, &runl_tmp)) { // go to the right
289 if (leaves[left]+j+runl_tmp > leaves[right])
290 runl_tmp = leaves[right] - (leaves[left]+j) - 1;
291 if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run?
292 prevRunLen = runl_tmp;
293 c = c | ((ulong)1 << (maxlevel-k-1));
294 j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
295 left = (right+left)>>1;
297 else { // go to the left
298 if (leaves[left]+j+runl_tmp > leaves[right])
299 runl_tmp = leaves[right] - (leaves[left]+j) - 1;
300 if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run?
301 prevRunLen = runl_tmp;
303 j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
304 right = (right+left)>>1;
306 // can never happen: if (left > right) return 0;
310 prevChar = charMap->select(c+1);
311 return charMap->select(c+1);
315 * Saving data fields:
318 ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1
322 void RLWaveletTree::Save(FILE *file) const
324 if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
325 throw std::runtime_error("RLWaveletTree::Save(): file write error (n).");
326 if (std::fwrite(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
327 throw std::runtime_error("RLWaveletTree::Save(): file write error (maxlevel).");
329 for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
331 if (std::fwrite(this->leaves + offset, sizeof(ulong), 1, file) != 1)
332 throw std::runtime_error("RLWaveletTree::Save(): file write error (leaves).");
338 ulong *chars = new ulong[256/W+1];
340 for (ulong i = 0; i < 256; ++i)
341 Tools::SetField(chars, 1, i, charMap->IsBitSet(i));
343 for (ulong offset = 0; offset < 256/W+1; ++offset)
345 if (std::fwrite(chars + offset, sizeof(ulong), 1, file) != 1)
346 throw std::runtime_error("RLWaveletTree::Save(): file write error (chars).");
355 RLWaveletTree::RLWaveletTree(FILE *file)
357 if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
358 throw std::runtime_error("RLWaveletTree::Load(): file read error (n).");
359 if (std::fread(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
360 throw std::runtime_error("RLWaveletTree::Load(): file read error (maxlevel).");
362 for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
364 if (std::fread(this->leaves + offset, sizeof(ulong), 1, file) != 1)
365 throw std::runtime_error("RLWaveletTree::Load(): file read error (leaves).");
368 bitrank = new GapEncode(file);
371 ulong *chars = new ulong[256/W+1];
373 for (ulong offset = 0; offset < 256/W+1; ++offset)
375 if (std::fread(chars + offset, sizeof(ulong), 1, file) != 1)
376 throw std::runtime_error("RLWaveletTree::Load(): file read error (chars).");
378 charMap = new BitRank(chars, 256, true);
380 prevRunLen = 0; // Init "buffer" values