+++ /dev/null
-/***************************************************************************
- * Copyright (C) 2007 by Veli Mäkinen *
- * *
- * *
- * This program is free software; you can redistribute it and/or modify *
- * it under the terms of the GNU General Public License as published by *
- * the Free Software Foundation; either version 2 of the License, or *
- * (at your option) any later version. *
- * *
- * This program is distributed in the hope that it will be useful, *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
- * GNU General Public License for more details. *
- * *
- * You should have received a copy of the GNU General Public License *
- * along with this program; if not, write to the *
- * Free Software Foundation, Inc., *
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
- ***************************************************************************/
-
-#include "RLWaveletTree.h"
-#include <stdexcept>
-using namespace std;
-
-void RLWaveletTree::remapAlphabet(uchar *text, ulong n)
-{
- ulong *chars = new ulong[256/W+1];
- ulong i;
- for (i = 0; i < 256/W+1; ++i)
- chars[i] = 0;
- // Set up flags:
- for (i = 0; i < n; ++i)
- Tools::SetField(chars, 1, text[i], 1);
- charMap = new BitRank(chars, 256, true);
-
- // Remap text
- for (i = 0; i < n; ++i)
- text[i] = charMap->rank(text[i])-1;
-}
-
-RLWaveletTree::RLWaveletTree(uchar *D, ulong n) {
- remapAlphabet(D, n);
-// std::cout << "Number of chars: " << charMap->rank(255) << ", maxlevel = " << Tools::CeilLog2(charMap->rank(255)) << std::endl;
-
- this->n = n;
- this->maxlevel = Tools::CeilLog2(charMap->rank(255));
-
- ulong i, k, value;
- ulong** bucket = new ulong*[maxlevel+1];
- ulong *wtree = new ulong[n*maxlevel/(8*sizeof(ulong))+1];
- for (i = 0; i < n*maxlevel/(8*sizeof(ulong))+1; i ++)
- wtree[i] = 0lu;
-
- // allocate memory for storing the locations of boudaries
- for (k=0; k<=maxlevel; k++) {
- bucket[k] = (ulong *)new ulong[(ulong)1 << k];
- for (i=0; i<((ulong)1 << k); i++)
- bucket[k][i]=0;
- }
-
- // compute the sizes of buckets
- for (i=0; i<n; i++) {
- value = D[i];
- for (k=0; k<=maxlevel; k++)
- bucket[k][value >> (maxlevel-k)]++;
- }
-
- // cumulate the bucket sizes into boundary locations
- for (k=0; k<=maxlevel; k++) {
- for (i=1; i<((ulong)1 << k); i++)
- bucket[k][i]+=bucket[k][i-1];
- for (i=((ulong)1 << k)-1; i>=1; i--)
- bucket[k][i]=bucket[k][i-1];
- bucket[k][0]=0;
- }
-
- //store the leaf information
- leaves = new ulong[((ulong)1 << maxlevel)+1];
- for (k=0; k<(ulong)1<<maxlevel; k++)
- leaves[k] = bucket[maxlevel][k];
- leaves[k]=n;
-
- // compute wtree using the information about boundaries
- for (i=0; i<n; i++) {
- value = D[i];
- for (k=0; k<maxlevel; k++)
- Tools::SetField(wtree,1,k*n+(bucket[k][value >> (maxlevel-k)])++,(value>>(maxlevel-k-1)) & (ulong)1);
- }
- delete [] D;
- // deallocate boundaries
- for (k=0; k<=maxlevel; k++)
- delete[] bucket[k];
- delete[] bucket;
-
- bitrank = new GapEncode(wtree, n*maxlevel, true);
- prevRunLen = 0;
- prevChar = 0;
- prevPos = 0;
-}
-
-RLWaveletTree::~RLWaveletTree() {
- delete charMap;
- delete [] leaves;
- delete bitrank;
-}
-
-ulong RLWaveletTree::rank(ulong c, ulong i) {
-// if (i >= n) FIXME Replace with assert?
-// std::cout << "RLWT::rank parameter i out of range" << std::endl;
- ulong k;
- ulong j=i;
- ulong left = 0, right = (ulong)1 << maxlevel;
- ulong before=0;
- c = charMap->rank(c)-1;
-
- // first round separately to avoid extra "if"
- if ((c >> (maxlevel-1)) & (ulong)1) { // go to the right
- j = bitrank->rank(j)-1;
- left = right>>1;
- }
- else { // go to the left
- j -= bitrank->rank(j);
- right = right>>1;
- }
- // then the next without the extra if
- for (k=1;k<maxlevel;k++) {
- before = bitrank->rank(n*k+leaves[left]-1);
- if ((c >> (maxlevel-k-1)) & (ulong)1) { // go to the right
- j = bitrank->rank(n*k+leaves[left]+j)-before-1;
- left = (right+left)>>1;
- }
- else { // go to the left
- j -= bitrank->rank(n*k+leaves[left]+j)-before;
- right = (right+left)>>1;
- }
- // can never happen: if (left > right) return 0;
- }
- return j+1;
-}
-
-ulong RLWaveletTree::select(ulong c, ulong j) {
- c = charMap->rank(c)-1;
- ulong left=c, right=c+1;
- ulong k, i=j-1, before;
- for (k=maxlevel-1; k>0;--k) {
- if ((c >> (maxlevel-k-1)) & (ulong)1) { // right side of parent
- left -= right-left;
- before = bitrank->rank(n*k+leaves[left]-1);
- i = bitrank->select(before+i+1)-n*k-leaves[left];
- }
- else { // left side of parent
- right +=right-left;
- before = n*k+leaves[left]-bitrank->rank(n*k+leaves[left]-1);
- i = bitrank->select0(before+i+1)-n*k-leaves[left];
- }
- }
- // last level separately to avoid one extra if
- if ((c >> (maxlevel-1)) & (ulong)1) // right side of parent
- return bitrank->select(i+1);
- else // left side of parent
- return bitrank->select0(i+1);
-}
-
-bool RLWaveletTree::IsCharAtPos(ulong c, ulong i) {
- return charAtPos(i)==c;
-}
-
-
-ulong RLWaveletTree::charAtPos(ulong i)
-{
-// if (i >= n) // FIXME Replace with assert?
-// std::cout << "RLWT::charAtPos parameter i out of range" << std::endl;
- ulong k;
- ulong j=i;
- ulong left = 0, right = (ulong)1 << maxlevel;
- ulong before=0;
- ulong c = 0;
- ulong rank_tmp = 0;
- // first round separately to avoid extra "if"
- if (bitrank->IsBitSet(j, &rank_tmp)) { // go to the right
- c = c | ((ulong)1 << (maxlevel-1));
- j = rank_tmp-1; //bitrank->rank(j)-1;
- left = right>>1;
- }
- else { // go to the left
- // c = c
- j -= rank_tmp; // bitrank->rank(j);
- right = right>>1;
- }
- // then the next without the extra if
- for (k=1;k<maxlevel;k++) {
- before = bitrank->rank(n*k+leaves[left]-1);
- if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
- c = c | ((ulong)1 << (maxlevel-k-1));
- j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
- left = (right+left)>>1;
- }
- else { // go to the left
- // c = c
- j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
- right = (right+left)>>1;
- }
- // can never happen: if (left > right) return 0;
- }
- return charMap->select(c+1);
-}
-
-/* returns also the rank-1 of char at given position i */
-ulong RLWaveletTree::charAtPos(ulong i, ulong *rank)
-{
-// if (i >= n) // FIXME Replace with assert?
-// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
- ulong k;
- ulong j=i;
- ulong left = 0, right = (ulong)1 << maxlevel;
- ulong before=0;
- ulong c = 0;
- ulong rank_tmp = 0;
- // first round separately to avoid extra "if"
- if (bitrank->IsBitSet(j, &rank_tmp)) { // FIXME j or leaves[left]+j ?
- c = c | ((ulong)1 << (maxlevel-1));
- j = rank_tmp-1; //bitrank->rank(j)-1;
- left = right>>1;
- }
- else { // go to the left
- // c = c
- j -= rank_tmp; //bitrank->rank(j);
- right = right>>1;
- }
- // then the next without the extra if
- for (k=1;k<maxlevel;k++) {
- before = bitrank->rank(n*k+leaves[left]-1);
- if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
- c = c | ((ulong)1 << (maxlevel-k-1));
- j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
- left = (right+left)>>1;
- }
- else { // go to the left
- // c = c
- j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
- right = (right+left)>>1;
- }
- // can never happen: if (left > right) return 0;
- }
- *rank = j;
- return charMap->select(c+1);
-}
-
-/* returns also the rank-1 of char at given position i
- * and trailing char-run length */
-ulong RLWaveletTree::charAtPosRun(ulong i, ulong *rank)
-{
- // Check for buffered data from previous call
- if (i > prevPos && i <= prevPos + prevRunLen)
- {
- *rank = prevRank + (i-prevPos);
- return prevChar;
- }
-
- prevPos = i;
-// if (i >= n) // FIXME Replace with assert?
-// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
- ulong k;
- ulong j=i;
- ulong left = 0, right = (ulong)1 << maxlevel;
- ulong before=0;
- ulong c = 0;
- ulong rank_tmp = 0;
- ulong runl_tmp = 0;
- // first round separately to avoid extra "if"
- if (bitrank->IsBitSet(j, &rank_tmp, &prevRunLen)) {
- if (j+prevRunLen > n)
- prevRunLen = n-j-1;
- c = c | ((ulong)1 << (maxlevel-1));
- j = rank_tmp-1; //bitrank->rank(j)-1;
- left = right>>1;
- }
- else { // go to the left
- // c = c
- if (j+prevRunLen > n)
- prevRunLen = n-j-1;
- j -= rank_tmp; //bitrank->rank(j);
- right = right>>1;
- }
-
- // then the next without the extra if
- for (k=1;k<maxlevel;k++) {
- before = bitrank->rank(n*k+leaves[left]-1);
- if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp, &runl_tmp)) { // go to the right
- if (leaves[left]+j+runl_tmp > leaves[right])
- runl_tmp = leaves[right] - (leaves[left]+j) - 1;
- if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run?
- prevRunLen = runl_tmp;
- c = c | ((ulong)1 << (maxlevel-k-1));
- j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
- left = (right+left)>>1;
- }
- else { // go to the left
- if (leaves[left]+j+runl_tmp > leaves[right])
- runl_tmp = leaves[right] - (leaves[left]+j) - 1;
- if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run?
- prevRunLen = runl_tmp;
- // c = c
- j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
- right = (right+left)>>1;
- }
- // can never happen: if (left > right) return 0;
- }
- *rank = j;
- prevRank = j;
- prevChar = charMap->select(c+1);
- return charMap->select(c+1);
-}
-
-/**
- * Saving data fields:
- ulong n;
- ulong maxlevel;
- ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1
- GapEncode *bitrank;
- BitRank *charMap;
-*/
-void RLWaveletTree::Save(FILE *file) const
-{
- if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("RLWaveletTree::Save(): file write error (n).");
- if (std::fwrite(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("RLWaveletTree::Save(): file write error (maxlevel).");
-
- for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
- {
- if (std::fwrite(this->leaves + offset, sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("RLWaveletTree::Save(): file write error (leaves).");
- }
-
- bitrank->Save(file);
-
- // Silly
- ulong *chars = new ulong[256/W+1];
- // Set up flags:
- for (ulong i = 0; i < 256; ++i)
- Tools::SetField(chars, 1, i, charMap->IsBitSet(i));
-
- for (ulong offset = 0; offset < 256/W+1; ++offset)
- {
- if (std::fwrite(chars + offset, sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("RLWaveletTree::Save(): file write error (chars).");
- }
-
- delete [] chars;
-}
-
-/**
- * Load from file
- */
-RLWaveletTree::RLWaveletTree(FILE *file)
-{
- if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("RLWaveletTree::Load(): file read error (n).");
- if (std::fread(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("RLWaveletTree::Load(): file read error (maxlevel).");
-
- for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
- {
- if (std::fread(this->leaves + offset, sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("RLWaveletTree::Load(): file read error (leaves).");
- }
-
- bitrank = new GapEncode(file);
-
- // Silly
- ulong *chars = new ulong[256/W+1];
- // Read flags
- for (ulong offset = 0; offset < 256/W+1; ++offset)
- {
- if (std::fread(chars + offset, sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("RLWaveletTree::Load(): file read error (chars).");
- }
- charMap = new BitRank(chars, 256, true);
-
- prevRunLen = 0; // Init "buffer" values
- prevChar = 0;
- prevPos = 0;
-}