Update: libcds WT, empty texts in bitv.
[SXSI/TextCollection.git] / RLWaveletTree.cpp
diff --git a/RLWaveletTree.cpp b/RLWaveletTree.cpp
new file mode 100644 (file)
index 0000000..dd043cb
--- /dev/null
@@ -0,0 +1,383 @@
+/***************************************************************************
+ *   Copyright (C) 2007 by Veli Mäkinen                                    *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.         *
+ ***************************************************************************/
+#include "RLWaveletTree.h"
+using namespace std;
+
+void RLWaveletTree::remapAlphabet(uchar *text, ulong n)
+{
+    ulong *chars = new ulong[256/W+1];
+    ulong i;
+    for (i = 0; i < 256/W+1; ++i)
+        chars[i] = 0;
+    // Set up flags:
+    for (i = 0; i < n; ++i)
+        Tools::SetField(chars, 1, text[i], 1);
+    charMap = new BitRank(chars, 256, true);
+
+    // Remap text
+    for (i = 0; i < n; ++i)
+        text[i] = charMap->rank(text[i])-1;
+}
+
+RLWaveletTree::RLWaveletTree(uchar *D, ulong n) {
+    remapAlphabet(D, n);
+//    std::cout << "Number of chars: " << charMap->rank(255) << ", maxlevel = " << Tools::CeilLog2(charMap->rank(255)) << std::endl;
+
+    this->n = n;
+    this->maxlevel = Tools::CeilLog2(charMap->rank(255));
+
+    ulong i, k, value;
+    ulong** bucket = new ulong*[maxlevel+1];
+    ulong *wtree = new ulong[n*maxlevel/(8*sizeof(ulong))+1];
+    for (i = 0; i < n*maxlevel/(8*sizeof(ulong))+1; i ++)
+        wtree[i] = 0lu;
+
+   // allocate memory for storing the locations of boudaries 
+    for (k=0; k<=maxlevel; k++) {
+        bucket[k] = (ulong *)new ulong[(ulong)1 << k];
+        for (i=0; i<((ulong)1 << k); i++)
+            bucket[k][i]=0;
+    }
+
+   // compute the sizes of buckets
+    for (i=0; i<n; i++) {
+        value = D[i];
+        for (k=0; k<=maxlevel; k++)
+            bucket[k][value >> (maxlevel-k)]++;
+    }
+
+   // cumulate the bucket sizes into boundary locations 
+    for (k=0; k<=maxlevel; k++) {
+        for (i=1; i<((ulong)1 << k); i++)
+            bucket[k][i]+=bucket[k][i-1];
+        for (i=((ulong)1 << k)-1; i>=1; i--)
+            bucket[k][i]=bucket[k][i-1];
+        bucket[k][0]=0;
+    }
+
+   //store the leaf information
+    leaves = new ulong[((ulong)1 << maxlevel)+1];
+    for (k=0; k<(ulong)1<<maxlevel; k++)
+        leaves[k] = bucket[maxlevel][k];
+    leaves[k]=n;
+
+   // compute wtree using the information about boundaries 
+    for (i=0; i<n; i++) {
+        value = D[i];
+        for (k=0; k<maxlevel; k++)
+            Tools::SetField(wtree,1,k*n+(bucket[k][value >> (maxlevel-k)])++,(value>>(maxlevel-k-1)) & (ulong)1);
+    }
+    delete [] D;
+    // deallocate boundaries
+    for (k=0; k<=maxlevel; k++)
+        delete[] bucket[k];
+    delete[] bucket;
+   
+    bitrank = new GapEncode(wtree, n*maxlevel, true);
+    prevRunLen = 0;
+    prevChar = 0;
+    prevPos = 0;
+}
+
+RLWaveletTree::~RLWaveletTree() {
+    delete charMap;
+    delete [] leaves;
+    delete bitrank;
+}
+
+ulong RLWaveletTree::rank(ulong c, ulong i) {
+//    if (i >= n) FIXME Replace with assert?
+//        std::cout << "RLWT::rank parameter i out of range" << std::endl;
+   ulong k;
+   ulong j=i;
+   ulong left = 0, right = (ulong)1 << maxlevel;
+   ulong before=0;
+   c = charMap->rank(c)-1;
+   
+   // first round separately to avoid extra "if"  
+   if ((c >> (maxlevel-1)) & (ulong)1) { // go to the right
+      j = bitrank->rank(j)-1;
+      left = right>>1; 
+   }
+   else { // go to the left
+      j -= bitrank->rank(j);
+      right = right>>1;
+   }
+   // then the next without the extra if
+   for (k=1;k<maxlevel;k++) {
+      before = bitrank->rank(n*k+leaves[left]-1);
+      if ((c >> (maxlevel-k-1)) & (ulong)1) { // go to the right
+         j = bitrank->rank(n*k+leaves[left]+j)-before-1;
+         left = (right+left)>>1; 
+      }
+      else { // go to the left
+         j -= bitrank->rank(n*k+leaves[left]+j)-before;
+        right = (right+left)>>1;
+      }
+      // can never happen: if (left > right) return 0;
+   }
+   return j+1;   
+}
+
+ulong RLWaveletTree::select(ulong c, ulong j) {
+    c = charMap->rank(c)-1;
+   ulong left=c, right=c+1;
+   ulong k, i=j-1, before;
+   for (k=maxlevel-1; k>0;--k) {
+       if ((c >> (maxlevel-k-1)) & (ulong)1) { // right side of parent
+          left -= right-left;
+         before = bitrank->rank(n*k+leaves[left]-1);
+         i = bitrank->select(before+i+1)-n*k-leaves[left];
+       }
+       else { // left side of parent
+          right +=right-left;
+          before = n*k+leaves[left]-bitrank->rank(n*k+leaves[left]-1);
+         i = bitrank->select0(before+i+1)-n*k-leaves[left];
+       }                         
+   }
+   // last level separately to avoid one extra if
+   if ((c >> (maxlevel-1)) & (ulong)1) // right side of parent
+      return bitrank->select(i+1);
+   else // left side of parent
+      return bitrank->select0(i+1);
+}
+
+bool RLWaveletTree::IsCharAtPos(ulong c, ulong i) {
+   return charAtPos(i)==c;
+}
+
+
+ulong RLWaveletTree::charAtPos(ulong i) 
+{
+//    if (i >= n) // FIXME Replace with assert?
+//        std::cout << "RLWT::charAtPos parameter i out of range" << std::endl;
+    ulong k;
+    ulong j=i;
+    ulong left = 0, right = (ulong)1 << maxlevel;
+    ulong before=0;
+    ulong c = 0;
+    ulong rank_tmp = 0;
+    // first round separately to avoid extra "if"  
+    if (bitrank->IsBitSet(j, &rank_tmp)) { // go to the right
+        c = c | ((ulong)1 << (maxlevel-1));
+        j = rank_tmp-1; //bitrank->rank(j)-1;
+        left = right>>1; 
+    }
+    else { // go to the left
+        // c = c
+        j -= rank_tmp; // bitrank->rank(j);
+        right = right>>1;
+    }
+    // then the next without the extra if
+    for (k=1;k<maxlevel;k++) {
+        before = bitrank->rank(n*k+leaves[left]-1);
+        if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
+            c = c | ((ulong)1 << (maxlevel-k-1));
+            j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
+            left = (right+left)>>1; 
+        }
+        else { // go to the left
+            // c = c
+            j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
+            right = (right+left)>>1;
+        }
+        // can never happen: if (left > right) return 0;
+    }
+    return charMap->select(c+1);         
+}
+
+/* returns also the rank-1 of char at given position i */
+ulong RLWaveletTree::charAtPos(ulong i, ulong *rank) 
+{
+//    if (i >= n) // FIXME Replace with assert?
+//        std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
+    ulong k;
+    ulong j=i;
+    ulong left = 0, right = (ulong)1 << maxlevel;
+    ulong before=0;
+    ulong c = 0;
+    ulong rank_tmp = 0;
+    // first round separately to avoid extra "if"  
+    if (bitrank->IsBitSet(j, &rank_tmp)) { // FIXME j or leaves[left]+j ?
+        c = c | ((ulong)1 << (maxlevel-1));
+        j = rank_tmp-1; //bitrank->rank(j)-1;
+        left = right>>1; 
+    }
+    else { // go to the left
+        // c = c
+        j -= rank_tmp; //bitrank->rank(j);
+        right = right>>1;
+    }
+    // then the next without the extra if
+    for (k=1;k<maxlevel;k++) {
+        before = bitrank->rank(n*k+leaves[left]-1);
+        if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
+            c = c | ((ulong)1 << (maxlevel-k-1));
+            j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
+            left = (right+left)>>1; 
+        }
+        else { // go to the left
+            // c = c
+            j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
+            right = (right+left)>>1;
+        }
+        // can never happen: if (left > right) return 0;
+    }
+    *rank = j;
+    return charMap->select(c+1);         
+}
+
+/* returns also the rank-1 of char at given position i
+ * and trailing char-run length */
+ulong RLWaveletTree::charAtPosRun(ulong i, ulong *rank) 
+{
+    // Check for buffered data from previous call
+    if (i > prevPos && i <= prevPos + prevRunLen)
+    {
+        *rank = prevRank + (i-prevPos);
+        return prevChar;
+    }
+
+    prevPos = i;
+//    if (i >= n) // FIXME Replace with assert?
+//        std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
+    ulong k;
+    ulong j=i;
+    ulong left = 0, right = (ulong)1 << maxlevel;
+    ulong before=0;
+    ulong c = 0;
+    ulong rank_tmp = 0;
+    ulong runl_tmp = 0;
+    // first round separately to avoid extra "if"  
+    if (bitrank->IsBitSet(j, &rank_tmp, &prevRunLen)) {
+        if (j+prevRunLen > n)
+            prevRunLen = n-j-1;
+        c = c | ((ulong)1 << (maxlevel-1));
+        j = rank_tmp-1; //bitrank->rank(j)-1;
+        left = right>>1; 
+    }
+    else { // go to the left
+        // c = c
+        if (j+prevRunLen > n)
+            prevRunLen = n-j-1;
+        j -= rank_tmp; //bitrank->rank(j);
+        right = right>>1;
+    }
+
+    // then the next without the extra if
+    for (k=1;k<maxlevel;k++) {
+        before = bitrank->rank(n*k+leaves[left]-1);
+        if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp, &runl_tmp)) { // go to the right
+            if (leaves[left]+j+runl_tmp > leaves[right])
+                runl_tmp = leaves[right] - (leaves[left]+j) - 1;
+            if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run?
+                prevRunLen = runl_tmp;
+            c = c | ((ulong)1 << (maxlevel-k-1));
+            j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
+            left = (right+left)>>1; 
+        }
+        else { // go to the left
+            if (leaves[left]+j+runl_tmp > leaves[right])
+                runl_tmp = leaves[right] - (leaves[left]+j) - 1;
+            if (prevRunLen > runl_tmp)  // FIXME Better way to keep track of a run?
+                prevRunLen = runl_tmp;
+            // c = c
+            j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
+            right = (right+left)>>1;
+        }
+        // can never happen: if (left > right) return 0;
+    }
+    *rank = j;
+    prevRank = j;
+    prevChar = charMap->select(c+1);
+    return charMap->select(c+1);         
+}
+
+/**
+ * Saving data fields:
+       ulong n;
+       ulong maxlevel;
+       ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1
+        GapEncode *bitrank;
+        BitRank *charMap;
+*/
+void RLWaveletTree::Save(FILE *file) const
+{
+    if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("RLWaveletTree::Save(): file write error (n).");
+    if (std::fwrite(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("RLWaveletTree::Save(): file write error (maxlevel).");
+
+    for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
+    {
+        if (std::fwrite(this->leaves + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("RLWaveletTree::Save(): file write error (leaves).");
+    }
+
+    bitrank->Save(file);
+
+    // Silly
+    ulong *chars = new ulong[256/W+1];
+    // Set up flags:
+    for (ulong i = 0; i < 256; ++i)
+        Tools::SetField(chars, 1, i, charMap->IsBitSet(i));
+
+    for (ulong offset = 0; offset < 256/W+1; ++offset)
+    {
+        if (std::fwrite(chars + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("RLWaveletTree::Save(): file write error (chars).");
+    }
+
+    delete [] chars;
+}
+
+/**
+ * Load from file
+ */
+RLWaveletTree::RLWaveletTree(FILE *file)
+{
+    if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("RLWaveletTree::Load(): file read error (n).");
+    if (std::fread(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("RLWaveletTree::Load(): file read error (maxlevel).");
+
+    for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
+    {
+        if (std::fread(this->leaves + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("RLWaveletTree::Load(): file read error (leaves).");
+    }
+
+    bitrank = new GapEncode(file);
+
+    // Silly
+    ulong *chars = new ulong[256/W+1];
+    // Read flags
+    for (ulong offset = 0; offset < 256/W+1; ++offset)
+    {
+        if (std::fread(chars + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("RLWaveletTree::Load(): file read error (chars).");
+    }
+    charMap = new BitRank(chars, 256, true);
+
+    prevRunLen = 0; // Init "buffer" values
+    prevChar = 0;
+    prevPos = 0;
+}