X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=RLWaveletTree.cpp;fp=RLWaveletTree.cpp;h=dd043cb42eb3402e1b5fdb239dea97fff204b2dd;hb=dbd62fe27e80414b4cf4fbf512ad1910916e6967;hp=0000000000000000000000000000000000000000;hpb=dee0161e4f31f06e9389db9986766395c1b1d2b8;p=SXSI%2FTextCollection.git diff --git a/RLWaveletTree.cpp b/RLWaveletTree.cpp new file mode 100644 index 0000000..dd043cb --- /dev/null +++ b/RLWaveletTree.cpp @@ -0,0 +1,383 @@ +/*************************************************************************** + * Copyright (C) 2007 by Veli Mäkinen * + * * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#include "RLWaveletTree.h" +using namespace std; + +void RLWaveletTree::remapAlphabet(uchar *text, ulong n) +{ + ulong *chars = new ulong[256/W+1]; + ulong i; + for (i = 0; i < 256/W+1; ++i) + chars[i] = 0; + // Set up flags: + for (i = 0; i < n; ++i) + Tools::SetField(chars, 1, text[i], 1); + charMap = new BitRank(chars, 256, true); + + // Remap text + for (i = 0; i < n; ++i) + text[i] = charMap->rank(text[i])-1; +} + +RLWaveletTree::RLWaveletTree(uchar *D, ulong n) { + remapAlphabet(D, n); +// std::cout << "Number of chars: " << charMap->rank(255) << ", maxlevel = " << Tools::CeilLog2(charMap->rank(255)) << std::endl; + + this->n = n; + this->maxlevel = Tools::CeilLog2(charMap->rank(255)); + + ulong i, k, value; + ulong** bucket = new ulong*[maxlevel+1]; + ulong *wtree = new ulong[n*maxlevel/(8*sizeof(ulong))+1]; + for (i = 0; i < n*maxlevel/(8*sizeof(ulong))+1; i ++) + wtree[i] = 0lu; + + // allocate memory for storing the locations of boudaries + for (k=0; k<=maxlevel; k++) { + bucket[k] = (ulong *)new ulong[(ulong)1 << k]; + for (i=0; i<((ulong)1 << k); i++) + bucket[k][i]=0; + } + + // compute the sizes of buckets + for (i=0; i> (maxlevel-k)]++; + } + + // cumulate the bucket sizes into boundary locations + for (k=0; k<=maxlevel; k++) { + for (i=1; i<((ulong)1 << k); i++) + bucket[k][i]+=bucket[k][i-1]; + for (i=((ulong)1 << k)-1; i>=1; i--) + bucket[k][i]=bucket[k][i-1]; + bucket[k][0]=0; + } + + //store the leaf information + leaves = new ulong[((ulong)1 << maxlevel)+1]; + for (k=0; k<(ulong)1<> (maxlevel-k)])++,(value>>(maxlevel-k-1)) & (ulong)1); + } + delete [] D; + // deallocate boundaries + for (k=0; k<=maxlevel; k++) + delete[] bucket[k]; + delete[] bucket; + + bitrank = new GapEncode(wtree, n*maxlevel, true); + prevRunLen = 0; + prevChar = 0; + prevPos = 0; +} + +RLWaveletTree::~RLWaveletTree() { + delete charMap; + delete [] leaves; + delete bitrank; +} + +ulong RLWaveletTree::rank(ulong c, ulong i) { +// if (i >= n) FIXME Replace with assert? +// std::cout << "RLWT::rank parameter i out of range" << std::endl; + ulong k; + ulong j=i; + ulong left = 0, right = (ulong)1 << maxlevel; + ulong before=0; + c = charMap->rank(c)-1; + + // first round separately to avoid extra "if" + if ((c >> (maxlevel-1)) & (ulong)1) { // go to the right + j = bitrank->rank(j)-1; + left = right>>1; + } + else { // go to the left + j -= bitrank->rank(j); + right = right>>1; + } + // then the next without the extra if + for (k=1;krank(n*k+leaves[left]-1); + if ((c >> (maxlevel-k-1)) & (ulong)1) { // go to the right + j = bitrank->rank(n*k+leaves[left]+j)-before-1; + left = (right+left)>>1; + } + else { // go to the left + j -= bitrank->rank(n*k+leaves[left]+j)-before; + right = (right+left)>>1; + } + // can never happen: if (left > right) return 0; + } + return j+1; +} + +ulong RLWaveletTree::select(ulong c, ulong j) { + c = charMap->rank(c)-1; + ulong left=c, right=c+1; + ulong k, i=j-1, before; + for (k=maxlevel-1; k>0;--k) { + if ((c >> (maxlevel-k-1)) & (ulong)1) { // right side of parent + left -= right-left; + before = bitrank->rank(n*k+leaves[left]-1); + i = bitrank->select(before+i+1)-n*k-leaves[left]; + } + else { // left side of parent + right +=right-left; + before = n*k+leaves[left]-bitrank->rank(n*k+leaves[left]-1); + i = bitrank->select0(before+i+1)-n*k-leaves[left]; + } + } + // last level separately to avoid one extra if + if ((c >> (maxlevel-1)) & (ulong)1) // right side of parent + return bitrank->select(i+1); + else // left side of parent + return bitrank->select0(i+1); +} + +bool RLWaveletTree::IsCharAtPos(ulong c, ulong i) { + return charAtPos(i)==c; +} + + +ulong RLWaveletTree::charAtPos(ulong i) +{ +// if (i >= n) // FIXME Replace with assert? +// std::cout << "RLWT::charAtPos parameter i out of range" << std::endl; + ulong k; + ulong j=i; + ulong left = 0, right = (ulong)1 << maxlevel; + ulong before=0; + ulong c = 0; + ulong rank_tmp = 0; + // first round separately to avoid extra "if" + if (bitrank->IsBitSet(j, &rank_tmp)) { // go to the right + c = c | ((ulong)1 << (maxlevel-1)); + j = rank_tmp-1; //bitrank->rank(j)-1; + left = right>>1; + } + else { // go to the left + // c = c + j -= rank_tmp; // bitrank->rank(j); + right = right>>1; + } + // then the next without the extra if + for (k=1;krank(n*k+leaves[left]-1); + if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right + c = c | ((ulong)1 << (maxlevel-k-1)); + j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1; + left = (right+left)>>1; + } + else { // go to the left + // c = c + j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before; + right = (right+left)>>1; + } + // can never happen: if (left > right) return 0; + } + return charMap->select(c+1); +} + +/* returns also the rank-1 of char at given position i */ +ulong RLWaveletTree::charAtPos(ulong i, ulong *rank) +{ +// if (i >= n) // FIXME Replace with assert? +// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl; + ulong k; + ulong j=i; + ulong left = 0, right = (ulong)1 << maxlevel; + ulong before=0; + ulong c = 0; + ulong rank_tmp = 0; + // first round separately to avoid extra "if" + if (bitrank->IsBitSet(j, &rank_tmp)) { // FIXME j or leaves[left]+j ? + c = c | ((ulong)1 << (maxlevel-1)); + j = rank_tmp-1; //bitrank->rank(j)-1; + left = right>>1; + } + else { // go to the left + // c = c + j -= rank_tmp; //bitrank->rank(j); + right = right>>1; + } + // then the next without the extra if + for (k=1;krank(n*k+leaves[left]-1); + if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right + c = c | ((ulong)1 << (maxlevel-k-1)); + j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1; + left = (right+left)>>1; + } + else { // go to the left + // c = c + j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before; + right = (right+left)>>1; + } + // can never happen: if (left > right) return 0; + } + *rank = j; + return charMap->select(c+1); +} + +/* returns also the rank-1 of char at given position i + * and trailing char-run length */ +ulong RLWaveletTree::charAtPosRun(ulong i, ulong *rank) +{ + // Check for buffered data from previous call + if (i > prevPos && i <= prevPos + prevRunLen) + { + *rank = prevRank + (i-prevPos); + return prevChar; + } + + prevPos = i; +// if (i >= n) // FIXME Replace with assert? +// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl; + ulong k; + ulong j=i; + ulong left = 0, right = (ulong)1 << maxlevel; + ulong before=0; + ulong c = 0; + ulong rank_tmp = 0; + ulong runl_tmp = 0; + // first round separately to avoid extra "if" + if (bitrank->IsBitSet(j, &rank_tmp, &prevRunLen)) { + if (j+prevRunLen > n) + prevRunLen = n-j-1; + c = c | ((ulong)1 << (maxlevel-1)); + j = rank_tmp-1; //bitrank->rank(j)-1; + left = right>>1; + } + else { // go to the left + // c = c + if (j+prevRunLen > n) + prevRunLen = n-j-1; + j -= rank_tmp; //bitrank->rank(j); + right = right>>1; + } + + // then the next without the extra if + for (k=1;krank(n*k+leaves[left]-1); + if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp, &runl_tmp)) { // go to the right + if (leaves[left]+j+runl_tmp > leaves[right]) + runl_tmp = leaves[right] - (leaves[left]+j) - 1; + if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run? + prevRunLen = runl_tmp; + c = c | ((ulong)1 << (maxlevel-k-1)); + j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1; + left = (right+left)>>1; + } + else { // go to the left + if (leaves[left]+j+runl_tmp > leaves[right]) + runl_tmp = leaves[right] - (leaves[left]+j) - 1; + if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run? + prevRunLen = runl_tmp; + // c = c + j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before; + right = (right+left)>>1; + } + // can never happen: if (left > right) return 0; + } + *rank = j; + prevRank = j; + prevChar = charMap->select(c+1); + return charMap->select(c+1); +} + +/** + * Saving data fields: + ulong n; + ulong maxlevel; + ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1 + GapEncode *bitrank; + BitRank *charMap; +*/ +void RLWaveletTree::Save(FILE *file) const +{ + if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Save(): file write error (n)."); + if (std::fwrite(&(this->maxlevel), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Save(): file write error (maxlevel)."); + + for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset) + { + if (std::fwrite(this->leaves + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Save(): file write error (leaves)."); + } + + bitrank->Save(file); + + // Silly + ulong *chars = new ulong[256/W+1]; + // Set up flags: + for (ulong i = 0; i < 256; ++i) + Tools::SetField(chars, 1, i, charMap->IsBitSet(i)); + + for (ulong offset = 0; offset < 256/W+1; ++offset) + { + if (std::fwrite(chars + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Save(): file write error (chars)."); + } + + delete [] chars; +} + +/** + * Load from file + */ +RLWaveletTree::RLWaveletTree(FILE *file) +{ + if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Load(): file read error (n)."); + if (std::fread(&(this->maxlevel), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Load(): file read error (maxlevel)."); + + for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset) + { + if (std::fread(this->leaves + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Load(): file read error (leaves)."); + } + + bitrank = new GapEncode(file); + + // Silly + ulong *chars = new ulong[256/W+1]; + // Read flags + for (ulong offset = 0; offset < 256/W+1; ++offset) + { + if (std::fread(chars + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Load(): file read error (chars)."); + } + charMap = new BitRank(chars, 256, true); + + prevRunLen = 0; // Init "buffer" values + prevChar = 0; + prevPos = 0; +}