X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;ds=sidebyside;f=RLWaveletTree.cpp;fp=RLWaveletTree.cpp;h=0000000000000000000000000000000000000000;hb=26060a4257e11ded60e950b1ffd18a3473dfe63c;hp=6b8d23208d4b87e814e87bcd84271c4f9ff4c4fa;hpb=f69aab1cc5ad9a0423dd85a93e70ddcf97075ab4;p=SXSI%2FTextCollection.git diff --git a/RLWaveletTree.cpp b/RLWaveletTree.cpp deleted file mode 100644 index 6b8d232..0000000 --- a/RLWaveletTree.cpp +++ /dev/null @@ -1,384 +0,0 @@ -/*************************************************************************** - * Copyright (C) 2007 by Veli Mäkinen * - * * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU General Public License as published by * - * the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU General Public License for more details. * - * * - * You should have received a copy of the GNU General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * - ***************************************************************************/ - -#include "RLWaveletTree.h" -#include -using namespace std; - -void RLWaveletTree::remapAlphabet(uchar *text, ulong n) -{ - ulong *chars = new ulong[256/W+1]; - ulong i; - for (i = 0; i < 256/W+1; ++i) - chars[i] = 0; - // Set up flags: - for (i = 0; i < n; ++i) - Tools::SetField(chars, 1, text[i], 1); - charMap = new BitRank(chars, 256, true); - - // Remap text - for (i = 0; i < n; ++i) - text[i] = charMap->rank(text[i])-1; -} - -RLWaveletTree::RLWaveletTree(uchar *D, ulong n) { - remapAlphabet(D, n); -// std::cout << "Number of chars: " << charMap->rank(255) << ", maxlevel = " << Tools::CeilLog2(charMap->rank(255)) << std::endl; - - this->n = n; - this->maxlevel = Tools::CeilLog2(charMap->rank(255)); - - ulong i, k, value; - ulong** bucket = new ulong*[maxlevel+1]; - ulong *wtree = new ulong[n*maxlevel/(8*sizeof(ulong))+1]; - for (i = 0; i < n*maxlevel/(8*sizeof(ulong))+1; i ++) - wtree[i] = 0lu; - - // allocate memory for storing the locations of boudaries - for (k=0; k<=maxlevel; k++) { - bucket[k] = (ulong *)new ulong[(ulong)1 << k]; - for (i=0; i<((ulong)1 << k); i++) - bucket[k][i]=0; - } - - // compute the sizes of buckets - for (i=0; i> (maxlevel-k)]++; - } - - // cumulate the bucket sizes into boundary locations - for (k=0; k<=maxlevel; k++) { - for (i=1; i<((ulong)1 << k); i++) - bucket[k][i]+=bucket[k][i-1]; - for (i=((ulong)1 << k)-1; i>=1; i--) - bucket[k][i]=bucket[k][i-1]; - bucket[k][0]=0; - } - - //store the leaf information - leaves = new ulong[((ulong)1 << maxlevel)+1]; - for (k=0; k<(ulong)1<> (maxlevel-k)])++,(value>>(maxlevel-k-1)) & (ulong)1); - } - delete [] D; - // deallocate boundaries - for (k=0; k<=maxlevel; k++) - delete[] bucket[k]; - delete[] bucket; - - bitrank = new GapEncode(wtree, n*maxlevel, true); - prevRunLen = 0; - prevChar = 0; - prevPos = 0; -} - -RLWaveletTree::~RLWaveletTree() { - delete charMap; - delete [] leaves; - delete bitrank; -} - -ulong RLWaveletTree::rank(ulong c, ulong i) { -// if (i >= n) FIXME Replace with assert? -// std::cout << "RLWT::rank parameter i out of range" << std::endl; - ulong k; - ulong j=i; - ulong left = 0, right = (ulong)1 << maxlevel; - ulong before=0; - c = charMap->rank(c)-1; - - // first round separately to avoid extra "if" - if ((c >> (maxlevel-1)) & (ulong)1) { // go to the right - j = bitrank->rank(j)-1; - left = right>>1; - } - else { // go to the left - j -= bitrank->rank(j); - right = right>>1; - } - // then the next without the extra if - for (k=1;krank(n*k+leaves[left]-1); - if ((c >> (maxlevel-k-1)) & (ulong)1) { // go to the right - j = bitrank->rank(n*k+leaves[left]+j)-before-1; - left = (right+left)>>1; - } - else { // go to the left - j -= bitrank->rank(n*k+leaves[left]+j)-before; - right = (right+left)>>1; - } - // can never happen: if (left > right) return 0; - } - return j+1; -} - -ulong RLWaveletTree::select(ulong c, ulong j) { - c = charMap->rank(c)-1; - ulong left=c, right=c+1; - ulong k, i=j-1, before; - for (k=maxlevel-1; k>0;--k) { - if ((c >> (maxlevel-k-1)) & (ulong)1) { // right side of parent - left -= right-left; - before = bitrank->rank(n*k+leaves[left]-1); - i = bitrank->select(before+i+1)-n*k-leaves[left]; - } - else { // left side of parent - right +=right-left; - before = n*k+leaves[left]-bitrank->rank(n*k+leaves[left]-1); - i = bitrank->select0(before+i+1)-n*k-leaves[left]; - } - } - // last level separately to avoid one extra if - if ((c >> (maxlevel-1)) & (ulong)1) // right side of parent - return bitrank->select(i+1); - else // left side of parent - return bitrank->select0(i+1); -} - -bool RLWaveletTree::IsCharAtPos(ulong c, ulong i) { - return charAtPos(i)==c; -} - - -ulong RLWaveletTree::charAtPos(ulong i) -{ -// if (i >= n) // FIXME Replace with assert? -// std::cout << "RLWT::charAtPos parameter i out of range" << std::endl; - ulong k; - ulong j=i; - ulong left = 0, right = (ulong)1 << maxlevel; - ulong before=0; - ulong c = 0; - ulong rank_tmp = 0; - // first round separately to avoid extra "if" - if (bitrank->IsBitSet(j, &rank_tmp)) { // go to the right - c = c | ((ulong)1 << (maxlevel-1)); - j = rank_tmp-1; //bitrank->rank(j)-1; - left = right>>1; - } - else { // go to the left - // c = c - j -= rank_tmp; // bitrank->rank(j); - right = right>>1; - } - // then the next without the extra if - for (k=1;krank(n*k+leaves[left]-1); - if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right - c = c | ((ulong)1 << (maxlevel-k-1)); - j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1; - left = (right+left)>>1; - } - else { // go to the left - // c = c - j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before; - right = (right+left)>>1; - } - // can never happen: if (left > right) return 0; - } - return charMap->select(c+1); -} - -/* returns also the rank-1 of char at given position i */ -ulong RLWaveletTree::charAtPos(ulong i, ulong *rank) -{ -// if (i >= n) // FIXME Replace with assert? -// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl; - ulong k; - ulong j=i; - ulong left = 0, right = (ulong)1 << maxlevel; - ulong before=0; - ulong c = 0; - ulong rank_tmp = 0; - // first round separately to avoid extra "if" - if (bitrank->IsBitSet(j, &rank_tmp)) { // FIXME j or leaves[left]+j ? - c = c | ((ulong)1 << (maxlevel-1)); - j = rank_tmp-1; //bitrank->rank(j)-1; - left = right>>1; - } - else { // go to the left - // c = c - j -= rank_tmp; //bitrank->rank(j); - right = right>>1; - } - // then the next without the extra if - for (k=1;krank(n*k+leaves[left]-1); - if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right - c = c | ((ulong)1 << (maxlevel-k-1)); - j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1; - left = (right+left)>>1; - } - else { // go to the left - // c = c - j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before; - right = (right+left)>>1; - } - // can never happen: if (left > right) return 0; - } - *rank = j; - return charMap->select(c+1); -} - -/* returns also the rank-1 of char at given position i - * and trailing char-run length */ -ulong RLWaveletTree::charAtPosRun(ulong i, ulong *rank) -{ - // Check for buffered data from previous call - if (i > prevPos && i <= prevPos + prevRunLen) - { - *rank = prevRank + (i-prevPos); - return prevChar; - } - - prevPos = i; -// if (i >= n) // FIXME Replace with assert? -// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl; - ulong k; - ulong j=i; - ulong left = 0, right = (ulong)1 << maxlevel; - ulong before=0; - ulong c = 0; - ulong rank_tmp = 0; - ulong runl_tmp = 0; - // first round separately to avoid extra "if" - if (bitrank->IsBitSet(j, &rank_tmp, &prevRunLen)) { - if (j+prevRunLen > n) - prevRunLen = n-j-1; - c = c | ((ulong)1 << (maxlevel-1)); - j = rank_tmp-1; //bitrank->rank(j)-1; - left = right>>1; - } - else { // go to the left - // c = c - if (j+prevRunLen > n) - prevRunLen = n-j-1; - j -= rank_tmp; //bitrank->rank(j); - right = right>>1; - } - - // then the next without the extra if - for (k=1;krank(n*k+leaves[left]-1); - if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp, &runl_tmp)) { // go to the right - if (leaves[left]+j+runl_tmp > leaves[right]) - runl_tmp = leaves[right] - (leaves[left]+j) - 1; - if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run? - prevRunLen = runl_tmp; - c = c | ((ulong)1 << (maxlevel-k-1)); - j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1; - left = (right+left)>>1; - } - else { // go to the left - if (leaves[left]+j+runl_tmp > leaves[right]) - runl_tmp = leaves[right] - (leaves[left]+j) - 1; - if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run? - prevRunLen = runl_tmp; - // c = c - j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before; - right = (right+left)>>1; - } - // can never happen: if (left > right) return 0; - } - *rank = j; - prevRank = j; - prevChar = charMap->select(c+1); - return charMap->select(c+1); -} - -/** - * Saving data fields: - ulong n; - ulong maxlevel; - ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1 - GapEncode *bitrank; - BitRank *charMap; -*/ -void RLWaveletTree::Save(FILE *file) const -{ - if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("RLWaveletTree::Save(): file write error (n)."); - if (std::fwrite(&(this->maxlevel), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("RLWaveletTree::Save(): file write error (maxlevel)."); - - for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset) - { - if (std::fwrite(this->leaves + offset, sizeof(ulong), 1, file) != 1) - throw std::runtime_error("RLWaveletTree::Save(): file write error (leaves)."); - } - - bitrank->Save(file); - - // Silly - ulong *chars = new ulong[256/W+1]; - // Set up flags: - for (ulong i = 0; i < 256; ++i) - Tools::SetField(chars, 1, i, charMap->IsBitSet(i)); - - for (ulong offset = 0; offset < 256/W+1; ++offset) - { - if (std::fwrite(chars + offset, sizeof(ulong), 1, file) != 1) - throw std::runtime_error("RLWaveletTree::Save(): file write error (chars)."); - } - - delete [] chars; -} - -/** - * Load from file - */ -RLWaveletTree::RLWaveletTree(FILE *file) -{ - if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("RLWaveletTree::Load(): file read error (n)."); - if (std::fread(&(this->maxlevel), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("RLWaveletTree::Load(): file read error (maxlevel)."); - - for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset) - { - if (std::fread(this->leaves + offset, sizeof(ulong), 1, file) != 1) - throw std::runtime_error("RLWaveletTree::Load(): file read error (leaves)."); - } - - bitrank = new GapEncode(file); - - // Silly - ulong *chars = new ulong[256/W+1]; - // Read flags - for (ulong offset = 0; offset < 256/W+1; ++offset) - { - if (std::fread(chars + offset, sizeof(ulong), 1, file) != 1) - throw std::runtime_error("RLWaveletTree::Load(): file read error (chars)."); - } - charMap = new BitRank(chars, 256, true); - - prevRunLen = 0; // Init "buffer" values - prevChar = 0; - prevPos = 0; -}