Update: libcds WT, empty texts in bitv.

author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Sat, 7 Feb 2009 17:00:01 +0000 (17:00 +0000)

committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Sat, 7 Feb 2009 17:00:01 +0000 (17:00 +0000)
author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sat, 7 Feb 2009 17:00:01 +0000 (17:00 +0000)
committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Sat, 7 Feb 2009 17:00:01 +0000 (17:00 +0000)
diff --git a/BSGAP.cpp b/BSGAP.cpp

new file mode 100644 (file)

index 0000000..b2db287
--- /dev/null
+++ b/BSGAP.cpp
@@ -0,0 +1,779 @@
+/**
+ * Binary-searchable gap encoding scheme (BSGAP)
+ * Niko Välimäki
+ *
+ * Compile: g++ -Wall -o testBSGAP BSGAP.cpp Tools.cpp
+ *
+ * Based on:
+ *
+ * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures:
+ * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007).
+ */
+
+#include "BSGAP.h"
+#include <stdexcept>
+#include <cassert>
+using namespace std;
+
+
+const uchar BSGAP::bit_table[] = {
+    0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,
+    1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,
+    2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,
+    1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,
+    3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,
+    1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,
+    2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1};
+
+/**
+ * samplerate == number of keys in each gap encoded binary search tree 
+ * default value is log^2 n where n is the number of keys.
+ */
+BSGAP::BSGAP(ulong *B, ulong u, bool freeB, ulong sampleRate)
+{
+    this->u = u;
+    this->n = 0;
+    this->P = 0;
+    this->offsetA = 0;
+    this->firstKeyA = 0;
+    this->topCount = 0; 
+
+    // Count keys
+    for (ulong i = 0; i < u; i ++)
+        if (Tools::GetField(B, 1, i))
+            this->n ++;
+
+    if (n == 0) // Sanity check
+    {
+        if (freeB && B)
+            delete [] B;
+
+        return;
+    }
+
+    unsigned log2n = Tools::FloorLog2(this->n);
+    if (log2n < 2)
+        log2n = 2;
+    if (sampleRate == 0)
+        this->b = log2n * log2n;
+    else
+        this->b = sampleRate;
+
+    ulong firstKey = 0, lastKey;
+    // Find the first key
+    while (!Tools::GetField(B, 1, firstKey))
+        firstKey ++;
+    
+    // Temp array of BSGAP structures
+    ulong **tempP = new ulong * [n / b + 1];
+    ulong *bitsInBSD = new ulong [n / b + 1];
+    ulong totalBits = 0;
+    this->firstKeyA = new BlockArray(n/b + 1, Tools::CeilLog2(this->u));
+    
+    for (ulong i = 0; i < n/b; i ++)
+    {
+        // Find the last key
+        lastKey = firstKey;
+        for (ulong k = 0; k < b + 1 && lastKey < u; lastKey ++)
+            if (Tools::GetField(B, 1, lastKey))
+                k ++;
+        lastKey --;
+        // Sanity check
+        if (lastKey > u)
+        {
+            cout << "error: lastKey = " << lastKey << ", u = " << u << endl;
+            exit(0);
+        }
+
+        // lastKey of the last substructure is this->u
+        if (i == n/b - 1 && n%b == 0)
+            lastKey = u;
+        
+        tempP[i] = GetSubtree(B, firstKey, firstKey, lastKey, b, true, bitsInBSD[i]); // FIXME: left of right subtree?
+        totalBits += bitsInBSD[i];
+        (*firstKeyA)[i] = firstKey;
+
+        firstKey = lastKey;
+    }
+    
+    if (n - (n/b) * b != 0)
+    {
+        // Find the first key
+        while (!Tools::GetField(B, 1, firstKey))
+            firstKey ++;
+        
+        tempP[n/b] = GetSubtree(B, firstKey, firstKey, u, n - (n/b) * b, true, bitsInBSD[n/b]); // FIXME: left of right subtree?
+        totalBits += bitsInBSD[n/b];
+        (*firstKeyA)[n/b] = firstKey;
+    }
+
+    // Number of nodes:
+    this->topCount = n%b ? n/b + 1: n/b;
+
+    // Catenate binary trees into one bitvector
+    this->P = new ulong[totalBits/W + 1];
+    this->bitsInP = totalBits;
+    ulong offset = 0;
+    for (ulong i = 0; i < this->topCount ; ++i)
+    {
+        BitCopy(P, tempP[i], offset, bitsInBSD[i]);
+        delete [] tempP[i];
+        offset += bitsInBSD[i];
+    }
+    delete [] tempP;
+
+    // Pointers to bit vector P  (binary tree starting offsets)
+    // FIXME Use more succinct representation?
+    offset = 0;
+    this->offsetA = new BlockArray(this->topCount, Tools::CeilLog2(totalBits));
+    for (ulong i = 0; i < this->topCount ; ++i)
+    {
+        (*offsetA)[i] = offset;
+        offset += bitsInBSD[i];
+    }
+    delete [] bitsInBSD;
+
+    if (freeB)
+        delete [] B;
+}
+
+BSGAP::~BSGAP()
+{
+        delete [] P;
+        delete offsetA;
+        delete firstKeyA;
+}
+
+ulong BSGAP::rank(ulong i)
+{
+    if (n == 0) // Trivial case
+        return 0;
+    if (i >= u)
+        throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
+    if ((*firstKeyA)[0] > i)
+        return 0;
+    ulong l = 0, r = topCount - 1;
+    while (l < r)   // Binary search on the array firstKeyA
+    {
+        ulong mid = l + (r - l)/2lu;
+        if ((*firstKeyA)[mid] < i)
+            l = mid + 1;
+        else
+            r = mid;
+    }
+    if ((*firstKeyA)[l] > i && l == 0)
+        return 0;
+    if ((*firstKeyA)[l] == i)
+        return 1 + l * b;
+    if ((*firstKeyA)[l] > i)
+        l --;
+    
+    ulong lastKey, firstKey = (*firstKeyA)[l];
+    if (l == topCount - 1)
+        lastKey = u;
+    else
+        lastKey = (*firstKeyA)[l + 1];
+    ulong nSub = b;
+    if (l == topCount - 1 && n%b)
+        nSub = n%b;
+
+    ulong k = rank(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
+    // Add rank of substructures P[0], P[1], ..., P[j - 2]
+    return l * b + k;
+}
+
+ulong BSGAP::rank0(ulong i)
+{
+    return i - rank(i) + 1;
+}
+
+bool BSGAP::IsBitSet(ulong i)
+{
+    if (n == 0) // Trivial case
+        return false;
+    if (i >= u)
+        return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
+    if ((*firstKeyA)[0] > i)
+        return false;
+    ulong l = 0, r = topCount - 1;
+    while (l < r)   // Binary search on the array firstKeyA
+    {
+        ulong mid = l + (r - l)/2lu;
+        if ((*firstKeyA)[mid] < i)
+            l = mid + 1;
+        else
+            r = mid;
+    }
+    if ((*firstKeyA)[l] > i && l == 0)
+        return false;
+    if ((*firstKeyA)[l] == i)
+        return true;
+    if ((*firstKeyA)[l] > i)
+        l --;
+    
+    ulong lastKey, firstKey = (*firstKeyA)[l];
+    if (l == topCount - 1)
+        lastKey = u;
+    else
+        lastKey = (*firstKeyA)[l + 1];
+    ulong nSub = b;
+    if (l == topCount - 1 && n%b)
+        nSub = n%b;
+
+    return IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
+}
+
+/**
+ * Returns also rank_1(i) via the second parameter.
+ */
+bool BSGAP::IsBitSet(ulong i, ulong *resultRank)
+{
+    *resultRank = 0;
+    if (n == 0) // Trivial case
+        return false;
+    if (i >= u)
+    {
+        *resultRank = n;
+        return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
+    }
+    if ((*firstKeyA)[0] > i)
+        return false;
+    ulong l = 0, r = topCount - 1;
+    while (l < r)   // Binary search on the array firstKeyA
+    {
+        ulong mid = l + (r - l)/2lu;
+        if ((*firstKeyA)[mid] < i)
+            l = mid + 1;
+        else
+            r = mid;
+    }
+    if ((*firstKeyA)[l] > i && l == 0)
+        return false;
+    if ((*firstKeyA)[l] == i)
+    {
+        *resultRank =  1 + l * b;
+        return true;
+    }
+    if ((*firstKeyA)[l] > i)
+        l --;
+    
+    ulong lastKey, firstKey = (*firstKeyA)[l];
+    if (l == topCount - 1)
+        lastKey = u;
+    else
+        lastKey = (*firstKeyA)[l + 1];
+    ulong nSub = b;
+    if (l == topCount - 1 && n%b)
+        nSub = n%b;
+
+    bool result = IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true, resultRank);
+    *resultRank += l * b;
+    return result;
+}
+
+
+ulong BSGAP::select(ulong i)
+{
+    if (n == 0)
+        return 0;
+    if (i == 0)
+        return 0;
+    if (i > n)
+        throw std::out_of_range("BSGAP::select(): Parameter i was out of range");
+
+    ulong j = i/b;
+    if (i%b == 0)
+        j --;
+    ulong lastKey, firstKey = (*firstKeyA)[j];
+    if (j == topCount - 1)
+        lastKey = u;
+    else
+        lastKey = (*firstKeyA)[j + 1];
+    ulong nSub = b;
+    if (j == topCount - 1 && n%b)
+        nSub = n%b;
+
+    return select(P, (*offsetA)[j], firstKey, i - j * b, firstKey, lastKey, nSub, true);
+}
+
+ulong BSGAP::select0(ulong i)
+{
+    if (n == 0) // Trivial case
+        return i-1;
+    if (i > u - n)
+        throw std::out_of_range("BSGAP::select0(): Parameter i was out of range");
+
+    ulong l = 0, r = topCount - 1;
+    while (l < r)   // Binary search on the array firstKeyA
+    {
+        ulong mid = l + (r - l)/2lu;
+        if ((*firstKeyA)[mid] - mid*b < i) // Number of zeros before [mid]
+            l = mid + 1;
+        else
+            r = mid;
+    }
+    if (l == 0 && (*firstKeyA)[l] >= i)
+        return i-1;
+    if (l > 0 && (*firstKeyA)[l] - l*b >= i)
+        --l; // FIXME: TEST
+    
+    i -= (*firstKeyA)[l] - l*b;
+   
+    ulong lastKey, firstKey = (*firstKeyA)[l];
+    if (l == topCount - 1)
+        lastKey = u;
+    else
+        lastKey = (*firstKeyA)[l + 1];
+    ulong nSub = b;
+    if (l == topCount - 1 && n%b)
+        nSub = n%b;
+
+    return select0(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
+}
+
+ulong BSGAP::select0(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
+{
+    while (1)
+    {
+        if (n == 0)
+            return l+i;
+    
+        // Check if subtree is full  --- FIXME Make this test an assert()!
+        if (IsSubtreeFull(firstKey, l, r, n))
+            throw runtime_error("BSGAP::select0(): subtree was full"); // Should not happen // (l == firstKey ? l + i - 1: l + i);
+        
+        // Parse key value
+        ulong key;
+        ulong x = DeltaDecode(B, offset);
+        if (leftChild)
+            key = r - x;
+        else
+            key = l + x;
+
+        if (numberOfZeros(firstKey, l, key, n) < i)
+        {
+            offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+            i = i - numberOfZeros(firstKey, l, key, n);
+            l = key;
+            n = n - n/2 - 1;
+            leftChild = false;
+        }
+        else
+        {
+            offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+            r = key;
+            n = n/2;
+            leftChild = true;
+        }
+    }
+}
+
+
+ulong BSGAP::rank(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
+{
+    // Number of keys in subtrees on left-side
+    ulong result = 0;
+    
+    while (1)
+    {
+
+        if (n == 0)
+            return result;
+    
+        // Check if subtree is full
+        if (IsSubtreeFull(firstKey, l, r, n))
+            return result + (l == firstKey ? i - l + 1 : i - l);
+    
+        // Parse key value
+        ulong key;
+        ulong x = DeltaDecode(B, offset);
+        if (leftChild)
+            key = r - x;
+        else
+            key = l + x;
+        if (key == i)
+            return result + n/2 + 1;
+
+        if (key < i)
+        {
+            offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+            result += n/2 + 1;
+            l = key;
+            n = n - n/2 - 1;
+            leftChild = false;
+        }
+        else
+        {
+            offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+            r = key;
+            n = n/2; 
+            leftChild = true;
+        }
+    }
+}
+
+
+bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
+{
+    // Number of keys in subtrees on left-side
+    ulong result = 0;
+    
+    while (1)
+    {
+
+        if (n == 0)
+            return false;
+    
+        // Check if subtree is full
+        if (IsSubtreeFull(firstKey, l, r, n))
+            return result + (l == firstKey ? i - l + 1 : i - l);
+    
+        // Parse key value
+        ulong key;
+        ulong x = DeltaDecode(B, offset);
+        if (leftChild)
+            key = r - x;
+        else
+            key = l + x;
+        if (key == i)
+            return true;
+
+        if (key < i)
+        {
+            offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+            result += n/2 + 1;
+            l = key;
+            n = n - n/2 - 1;
+            leftChild = false;
+        }
+        else
+        {
+            offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+            r = key;
+            n = n/2; 
+            leftChild = true;
+        }
+    }
+}
+
+// Returns also the rank of i
+bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild, ulong *resultRank)
+{
+    // Number of keys in subtrees on left-side
+    ulong result = 0;
+    
+    while (1)
+    {
+
+        if (n == 0)
+        {
+            *resultRank = result;
+            return false;
+        }
+    
+        // Check if subtree is full
+        if (IsSubtreeFull(firstKey, l, r, n))
+        {
+            *resultRank = result + (l == firstKey ? i - l + 1 : i - l);
+            return true;
+        }
+    
+        // Parse key value
+        ulong key;
+        ulong x = DeltaDecode(B, offset);
+        if (leftChild)
+            key = r - x;
+        else
+            key = l + x;
+        if (key == i)
+        {
+            *resultRank = result + n/2 + 1;
+            return true;
+        }
+
+        if (key < i)
+        {
+            offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+            result += n/2 + 1;
+            l = key;
+            n = n - n/2 - 1;
+            leftChild = false;
+        }
+        else
+        {
+            offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+            r = key;
+            n = n/2; 
+            leftChild = true;
+        }
+    }
+}
+
+
+ulong BSGAP::select(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
+{
+    while (1)
+    {
+        if (n == 0)
+            return 0;
+    
+        // Check if subtree is full
+        if (IsSubtreeFull(firstKey, l, r, n))
+            return (l == firstKey ? l + i - 1: l + i);
+        
+        // Parse key value
+        ulong key;
+        ulong x = DeltaDecode(B, offset);
+        if (leftChild)
+            key = r - x;
+        else
+            key = l + x;
+        if (n/2 + 1 == i)
+            return key;
+        if (n/2 + 1 < i)
+        {
+            offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+            i = i - n/2 - 1;
+            l = key;
+            n = n - n/2 - 1;
+            leftChild = false;
+        }
+        else
+        {
+            offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+            r =  key;
+            n = n/2;
+            leftChild = true;
+        }
+    }
+}
+
+ulong * BSGAP::GetSubtree(ulong *B, ulong firstKey, ulong l, ulong r, ulong n, bool leftChild, ulong &bits)
+{
+    bits = 0;
+    if (n == 0)
+        return 0;
+
+    // Check if subtree is full
+    if (IsSubtreeFull(firstKey, l, r, n))
+        return 0;
+
+    // Pick the median
+    ulong key = FindMedian(B, firstKey, l, r, n);
+
+    ulong *leftSub, *rightSub, leftBits, rightBits;
+    leftSub = GetSubtree(B, firstKey, l, key, n/2, true, leftBits);
+    rightSub = GetSubtree(B, firstKey, key, r, n - n/2 - 1, false, rightBits);
+
+    // Encode key value
+    ulong *keyDelta, keyBits;
+    if (leftChild)
+        keyDelta = DeltaEncode(r - key, keyBits, true);
+    else
+        keyDelta = DeltaEncode(key - l, keyBits, true);
+
+    // Encode jump offset if left and right subtrees exists
+    ulong *jumpOffset = 0, jumpBits = 0;
+    if (leftBits != 0 && rightBits != 0)
+        jumpOffset = DeltaEncode(leftBits, jumpBits);
+
+    // bits is the sum of keyBits, jumpBits, leftBits and rightBits 
+    bits = keyBits + jumpBits + leftBits + rightBits;
+    ulong *output = new ulong[bits / W + 1];
+    
+    /**
+     * Write "output"
+     */
+    BitCopy(output, keyDelta, 0, keyBits);
+    delete [] keyDelta;
+    ulong offset = keyBits;
+
+    if (jumpBits != 0)
+    {
+        BitCopy(output, jumpOffset, offset, jumpBits);
+        offset += jumpBits;
+        delete [] jumpOffset;
+    }
+    
+    BitCopy(output, leftSub, offset, leftBits);
+    offset += leftBits;
+    BitCopy(output, rightSub, offset, rightBits);
+    offset += rightBits;
+    
+    assert(offset == bits);
+    if (leftBits != 0)
+        delete [] leftSub;
+    if (rightBits != 0)
+        delete [] rightSub;
+    return output;
+}
+
+void BSGAP::BitCopy(ulong *dest, ulong *src, ulong offset, ulong len)
+{
+    if (len == 0)
+        return;
+    ulong i;
+    
+    if (len <= W)
+    {
+        Tools::SetVariableField(dest, len, offset, *src);
+        return;
+    }
+
+    for (i = 0; i < len/W; i ++)
+        Tools::SetVariableField(dest, W, offset + i * W, src[i]);
+    
+    if (i * W < len)
+        Tools::SetVariableField(dest, len - i*W, offset + i * W, src[i]);
+}
+
+ulong BSGAP::FindMedian(ulong *B, ulong firstKey, ulong l, ulong r, ulong n)
+{
+    // Linear scan: slow but affects only the construction time
+    n = n/2 + 1;
+    if (l != firstKey)
+        l ++;   // Skip left boundary
+    
+    while (n && l <= r)
+    {
+        if (Tools::GetField(B, 1, l))
+            n --;
+        l ++;
+    }
+    return l - 1;
+}
+
+ulong BSGAP::DeltaDecode(ulong *B, ulong &offset)
+{
+    // Dense coding
+    // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
+    const unsigned s = 180; // FIXME These should be class variables
+    const unsigned c = 256 - s; // FIXME Choose <s,c> values!
+    const unsigned b = 8; // one byte.
+
+    ulong i = 0;
+    ulong base = 0;
+    ulong tot = s;
+    while (Tools::GetVariableField(B, b, offset) < c)
+    {
+        i = i*c + Tools::GetVariableField(B, b, offset);
+        offset += b;
+        base += tot;
+        tot = tot * c;
+    }
+    i = i * s + (Tools::GetVariableField(B, b, offset) - c);
+    offset += b;
+    return i + base;
+}
+
+
+ulong * BSGAP::DeltaEncode(ulong value, ulong &bits, bool onlyPositive, bool negative)
+{
+    // Dense coding
+    // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
+    const unsigned s = 180;  // FIXME These should be class variables
+    const unsigned c = 256 - s; // FIXME Choose <s,c> values!
+    const unsigned b = 8; // one byte
+
+    // Calculate size first:
+    bits = b;
+    ulong x = value / s;
+    while (x > 0)
+    {
+        --x;
+        bits += b;
+        x = x/c;
+    }
+
+    // bits is now the length of the whole codeword
+    ulong *B = new ulong[bits/W + 1];
+    
+    // output codeword (backwards):
+    unsigned offset = bits - b;
+    ulong output = c + (value % s);
+    Tools::SetVariableField(B, b, offset, output);
+
+    x = value / s;
+    while (x > 0)
+    {
+        --x;
+        offset -= b;
+
+        output = x % c;
+        Tools::SetVariableField(B, b, offset, output);
+
+        x = x/c;
+    }
+                            
+    return B;
+}
+
+
+/**
+ * Saving the following data fields:
+ *   ulong u, n;             // Universe size, number of 1-bits in B
+ *   ulong topCount;   // Top structure and the number of substructures
+ *   ulong bitsInP;
+ *   ulong *P;               // Pointer to BSGAP structures
+ *   unsigned b;             // Subdictionary size (\log^2 n)
+ *   BlockArray *offsetA;    // Array of pointers (into bitvector P)
+ *   BlockArray *firstKeyA;  // Array of first key positions of the substructures
+ */
+void BSGAP::Save(FILE *file) const
+{
+    if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Save(): file write error (u).");
+
+    if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Save(): file write error (n).");
+
+    if (std::fwrite(&(this->topCount), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Save(): file write error (topCount).");
+
+    if (std::fwrite(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Save(): file write error (bitsInP).");
+    
+    for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
+    {
+        if (std::fwrite(this->P+offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("BSGAP::Save(): file write error (P).");
+    }
+
+    if (std::fwrite(&(this->b), sizeof(unsigned), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Save(): file write error (b).");
+
+    offsetA->Save(file);
+    firstKeyA->Save(file);
+}
+
+/**
+ * Load from file
+ */
+BSGAP::BSGAP(FILE *file)
+{
+    if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Load(): file read error (u).");
+
+    if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Load(): file read error (n).");
+
+    if (std::fread(&(this->topCount), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Load(): file read error (topCount).");
+
+    if (std::fread(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Load(): file read error (bitsInP).");
+
+    P = new ulong[bitsInP/W+1];
+    for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
+    {
+        if (std::fread(this->P+offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("BSGAP::Load(): file read error (P).");
+    }
+
+    if (std::fread(&(this->b), sizeof(unsigned), 1, file) != 1)
+        throw std::runtime_error("BSGAP::Load(): file read error (b).");
+
+    offsetA = new BlockArray(file);
+    firstKeyA = new BlockArray(file);
+}
+
diff --git a/BSGAP.h b/BSGAP.h

new file mode 100644 (file)

index 0000000..a1fba5d
--- /dev/null
+++ b/BSGAP.h
@@ -0,0 +1,119 @@
+/**
+ * Binary-searchable gap encoding scheme (BSGAP)
+ * Niko Välimäki
+ *
+ * Based on:
+ *
+ * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures:
+ * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007).
+ */
+#ifndef _BSGAP_H_
+#define _BSGAP_H_
+
+#include "Tools.h"
+#include "BlockArray.h"
+
+class BSGAP
+{
+public:
+    BSGAP(ulong *, ulong, bool = false, ulong = 0);
+    BSGAP(FILE *);
+    ~BSGAP();
+    ulong rank(ulong);
+    ulong rank0(ulong);
+    ulong rank(bool b, ulong i) {return b?rank(i):i+1-rank(i);};
+    ulong select(ulong);
+    ulong select0(ulong);
+    ulong select(bool b, ulong i) {return b?select(i):select0(i);};
+    
+    bool IsBitSet(ulong pos);
+    // Returns also the rank_1(pos) via the second param
+    bool IsBitSet(ulong pos, ulong *);
+    
+    ulong size() {return u;};
+    ulong bitsSet() {return n;}
+    void Save(FILE *) const;
+private:
+    static const uchar bit_table[];
+    ulong u, n;             // Universe size, number of 1-bits in B
+    ulong topCount;   // Top structure and the number of substructures
+    ulong bitsInP;          // Length of bit vector P
+    ulong *P;               // Pointer to BSGAP structures
+    unsigned b;             // Subdictionary size (\log^2 n)
+    BlockArray *offsetA;    // Array of pointers (into bitvector P)
+    BlockArray *firstKeyA;  // Array of first key positions of the substructures
+    
+    ulong rank(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool);
+    bool IsBitSet(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool);
+    bool IsBitSet(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool, ulong *);
+    ulong select(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool);
+    ulong select0(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool);
+    ulong * GetSubtree(ulong *, ulong, ulong, ulong, ulong, bool, ulong &);
+    ulong DeltaDecode(ulong *, ulong &);
+    ulong DeltaDecode(ulong *, ulong &, bool &);
+    ulong * DeltaEncode(ulong, ulong &, bool = true, bool = false);
+    void BitCopy(ulong *, ulong *, ulong, ulong);
+    ulong FindMedian(ulong *, ulong, ulong, ulong, ulong);
+    unsigned FindLowestBit(ulong);
+
+    inline bool IsSubtreeFull(ulong firstKey, ulong l, ulong r, ulong n)
+    {
+        if (l == firstKey)
+            n --;
+        
+        if (r - l - 1 == n)
+            return true;
+        return false;
+    }
+    /**
+     * Returns the number of zeros in the left subtree
+     */
+    inline ulong numberOfZeros(ulong firstKey, ulong l, ulong key, ulong n)
+    {
+        ulong z = key - l - 1 - n/2;
+        if (l == firstKey)
+            ++z;
+        if (z == -1lu)
+            z = 0;
+        return z;
+    }
+
+    inline ulong FindRightSubtree(ulong *B, ulong offset, ulong firstKey, ulong key, ulong l, ulong r, ulong n)
+    {
+        if (n - n/2 - 1 == 0)
+            return 0;       // Does not exists
+    
+        if (IsSubtreeFull(firstKey, key, r, n - n/2 - 1))
+            return 0;       // Right subtree is full (does not exists)
+    
+        if (n/2 == 0)
+            return offset; // Left subtree does not exists
+    
+        if (IsSubtreeFull(firstKey, l, key, n/2))
+            return offset; // Left subtree is full (does not exists)
+    
+        ulong len = DeltaDecode(B, offset);
+        return offset + len;         // Skipping left subtree
+    }
+
+    inline ulong FindLeftSubtree(ulong *B, ulong offset, ulong firstKey, ulong key, ulong l, ulong r, ulong n)
+    {
+        if (n/2 == 0)
+            return 0;       // Does not exists
+    
+        if (IsSubtreeFull(firstKey, l, key, n/2))
+            return 0;       // Left subtree is full
+    
+        if (n - n/2 - 1 == 0)
+            return offset; // Right subtree does not exists
+    
+        if (IsSubtreeFull(firstKey, key, r, n - n/2 - 1))
+            return offset; // Right subtree is full
+    
+        DeltaDecode(B, offset); // Calculate new offset
+        return offset;      // Left subtree is the first one
+    }
+    
+};
+
+#endif
diff --git a/BitRank.cpp b/BitRank.cpp

index 61dcc42..bccd82c 100644 (file)
--- a/BitRank.cpp
+++ b/BitRank.cpp
@@ -92,8 +92,6 @@ BitRank::BitRank(ulong *bitarray, ulong n, bool owner) {
      data=bitarray;
      this->owner = owner;
      this->n=n;  // length of bitarray in bits
-    b = W; // b is a word
-    s=b*superFactor;
      ulong aux=(n+1)%W;
      if (aux != 0)
          integers = (n+1)/W+1;
@@ -103,7 +101,7 @@ BitRank::BitRank(ulong *bitarray, ulong n, bool owner) {
  }
  
  BitRank::~BitRank() {
-    delete [] Rs;
+    delete Rs;
      delete [] Rb;
      if (owner) delete [] data;
  }
@@ -112,16 +110,16 @@ BitRank::~BitRank() {
  void BitRank::BuildRank()
  {
      ulong num_sblock = n/s;
-    ulong num_block = n/b;
-    Rs = new ulong[num_sblock+1];//+1 we add the 0 pos
+    ulong num_block = n/W;
+    Rs = new BlockArray(num_sblock+1, Tools::CeilLog2(n));//+1 we add the 0 pos
      Rb = new uchar[num_block+1];//+1 we add the 0 pos
         
         ulong j;
-    Rs[0] = 0lu;
+    (*Rs)[0] = 0lu;
         
      for (j=1;j<=num_sblock;j++) 
          {
-                       Rs[j]=BuildRankSub((j-1)*superFactor,superFactor)+Rs[j-1];
+                       (*Rs)[j]=BuildRankSub((j-1)*superFactor,superFactor)+(*Rs)[j-1];
                 }
      
      Rb[0]=0;
@@ -148,7 +146,7 @@ ulong BitRank::BuildRankSub(ulong ini, ulong bloques){
  //this rank ask from 0 to n-1
  ulong BitRank::rank(ulong i) {
      ++i; // the following gives sum of 1s before i 
-    return Rs[i>>8]+Rb[i>>wordShift]
+    return (*Rs)[i>>8]+Rb[i>>wordShift]
          +popcount(data[i >> wordShift] & ((1lu << (i & Wminusone))-1));
  }
  
@@ -165,14 +163,14 @@ ulong BitRank::select(ulong x) {
          
      ulong l=0, r=n/s;
      ulong mid=(l+r)/2;      
-    ulong rankmid = Rs[mid];
+    ulong rankmid = (*Rs)[mid];
      while (l<=r) {
          if (rankmid<x)
              l = mid+1;
          else
              r = mid-1;
          mid = (l+r)/2;              
-        rankmid = Rs[mid];
+        rankmid = (*Rs)[mid];
      }    
      //sequential search using popcount over a int
      ulong left;
@@ -191,7 +189,7 @@ ulong BitRank::select(ulong x) {
          ones = popcount(j);
      }
      //sequential search using popcount over a char
-    left=left*b; 
+    left=left*W; 
      rankmid = popcount8(j);
      if (rankmid < x) {
          j=j>>8;
@@ -233,14 +231,14 @@ ulong BitRank::select0(ulong x) {
          
      ulong l=0, r=n/s;
      ulong mid=(l+r)/2;
-    ulong rankmid = mid * s - Rs[mid];
+    ulong rankmid = mid * s - (*Rs)[mid];
      while (l<=r) {
          if (rankmid<x)
              l = mid+1;
          else
              r = mid-1;
          mid = (l+r)/2;              
-        rankmid = mid * s - Rs[mid];
+        rankmid = mid * s - (*Rs)[mid];
      }    
      
      //sequential search using popcount over a int
@@ -261,7 +259,7 @@ ulong BitRank::select0(ulong x) {
      }
      
      //sequential search using popcount over a char
-    left=left*b;
+    left=left*W;
      rankmid = 8 - popcount8(j);
      if (rankmid < x) {
          j=j>>8;
@@ -298,3 +296,39 @@ bool BitRank::IsBitSet(ulong i) {
  ulong BitRank::NumberOfBits() {
      return n;
  }
+
+/**
+ * Saving data fields:
+
+    ulong n;
+    ulong *data; //here is the bit-arra
+ */
+void BitRank::Save(FILE *file)
+{
+    if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BitRank::Save(): file write error (n).");
+
+    for (ulong offset = 0; offset < integers; ++offset)
+        if (std::fwrite(this->data + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("BitRank::Save(): file write error (data).");
+}
+
+BitRank::BitRank(FILE *file)
+{
+    owner = 1;
+    if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("BitRank::Load(): file read error (n).");
+
+    ulong aux=(n+1)%W;
+    if (aux != 0)
+        integers = (n+1)/W+1;
+    else 
+        integers = (n+1)/W;
+
+    data = new ulong[integers];
+    for (ulong offset = 0; offset < integers; ++offset)
+        if (std::fread(this->data + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("BitRank::Load(): file read error (data).");
+
+    BuildRank();
+}
diff --git a/BitRank.h b/BitRank.h

index c7783d2..49d3647 100644 (file)
--- a/BitRank.h
+++ b/BitRank.h
@@ -21,6 +21,7 @@
  #ifndef _BITSTREAM_H_
  #define _BITSTREAM_H_
  
+#include "BlockArray.h"
  #include "Tools.h"
  
  class BitRank {
@@ -33,18 +34,20 @@ private:
      static const unsigned wordShift = 6;
      static const unsigned superFactor = 4; // 256 bit blocks
  #endif
+    static const unsigned s = W*superFactor;
      
      ulong *data; //here is the bit-array
      bool owner;
      ulong n,integers;
-    unsigned b,s; 
-    ulong *Rs; //superblock array
+    BlockArray *Rs; //superblock array
      uchar *Rb; //block array
      ulong BuildRankSub(ulong,  ulong); //internal use of BuildRank
      void BuildRank(); //crea indice para rank
  public:
      BitRank(ulong *, ulong, bool);
+    BitRank(FILE *);
      ~BitRank(); //destructor    
+    void Save(FILE *);
      ulong rank(ulong i); //Rank from 0 to n-1
      ulong select(ulong x); // gives the position of the x:th 1.
      ulong select0(ulong x); // gives the position of the x:th 0.
diff --git a/BlockArray.h b/BlockArray.h

index cc60c35..99aff91 100644 (file)
--- a/BlockArray.h
+++ b/BlockArray.h
@@ -1,7 +1,8 @@
  #ifndef _BLOCK_ARRAY_H_
  #define _BLOCK_ARRAY_H_
-#include <iostream>
  #include "Tools.h"
+#include <iostream>
+#include <stdexcept>
  
  class BlockArray
  {
@@ -43,7 +44,44 @@ public:
      ulong spaceInBits() {
          return n*blockLength+W; // plus 4 ulong's
      }
+
+    /**
+     * Saving data fields:
+     *     ulong n;
+     *     ulong blockLength;
+     *     ulong* data;
+     */
+    void Save(FILE *file) const
+    {
+        if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("BlockArray::Save(): file write error (n).");
+        if (std::fwrite(&(this->blockLength), sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("BlockArray::Save(): file write error (blockLength).");
      
+        for (ulong offset = 0; offset < n*blockLength/W+1; offset ++)
+        {
+            if (std::fwrite(this->data + offset, sizeof(ulong), 1, file) != 1)
+                throw std::runtime_error("BlockArray::Save(): file write error (data).");
+        }
+    }
+
+    /**
+     * Load from file
+     */
+    BlockArray(FILE *file)
+    {
+        if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("BlockArray::Load(): file read error (n).");
+        if (std::fread(&(this->blockLength), sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("BlockArray::Load(): file read error (blockLength).");
+
+        data = new ulong[n*blockLength/W+1];
+        for (ulong offset = 0; offset < n*blockLength/W+1; offset ++)
+        {
+            if (std::fread(this->data + offset, sizeof(ulong), 1, file) != 1)
+                throw std::runtime_error("BlockArray::Load(): file read error (data).");
+        }
+    }
  };
  
  #endif
diff --git a/CSA.cpp b/CSA.cpp

index 5719154..a86407d 100644 (file)
--- a/CSA.cpp
+++ b/CSA.cpp
@@ -35,6 +35,8 @@ using std::map;
  
  using SXSI::TextCollection;
  
+// Save file version info
+const uchar CSA::versionFlag = 2;
  
  ////////////////////////////////////////////////////////////////////////////
  // Class CSA::THuffAlphabetRank
@@ -120,6 +122,24 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() {
          delete bitrank;
  }
  
+/**
+ * Saving data fields:
+        BitRank *bitrank;
+        bool leaf;
+        uchar ch;
+        left child;
+        right child;
+*/
+void CSA::THuffAlphabetRank::Save(FILE *file)
+{
+
+}
+
+CSA::THuffAlphabetRank::THuffAlphabetRank(FILE *file)
+{
+
+
+}
  
  /////////////////////////////////////////////f///////////////////////////////
  // Class CSA
@@ -129,9 +149,9 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() {
   * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
   */
  CSA::CSA(unsigned samplerate)
-    : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),  
-      codetable(0), dynFMI(0), numberOfTexts(0), maxTextLength(0), endmarkerDocId(0),
-      textLength(0), textStartPos(0)
+    : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0),  
+      codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
+      textLength(0), textStartPos(0), emptyTextRank(0)
  {
      this->samplerate = samplerate;
  
@@ -164,20 +184,38 @@ void CSA::InsertText(uchar const * text)
      if (m > maxTextLength)
          maxTextLength = m; // Store length of the longest text seen so far.
  
-    this->n += m;
-    this->numberOfTexts ++;
-    dynFMI->addText(text, m);
+    if (m > 1)
+    {
+        this->n += m;
+        this->numberOfTexts ++;
+        this->numberOfAllTexts ++;
+        dynFMI->addText(text, m);
+    }
+    else
+    {
+        emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
+        this->numberOfAllTexts ++;            // Replace with dynamic bitvector
+    }
  }
  
  void CSA::MakeStatic()
  {
      // Sanity check:
      if (dynFMI == 0)
+        throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
+
+    // Bitvector of empty texts
      {
-        std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
-        std::exit(0);
+        //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
+        ulong *tempB = new ulong[numberOfAllTexts/W + 1];
+        for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
+            tempB[i] = 0;
+        for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
+            Tools::SetField(tempB, 1, (*it), 1);
+        emptyTextId.clear();
+        emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
      }
- 
+   
      uchar *bwt = dynFMI->getBWT();
      /*printf("123456789012345678901234567890123456789\n");
      for (TextPosition i = 0; i < n; i ++)
@@ -199,24 +237,27 @@ void CSA::MakeStatic()
      delete dynFMI;
      dynFMI = 0;
  
-    makewavelet(bwt);
+    makewavelet(bwt); // Deletes bwt!
+    bwt = 0;
  
      // Calculate BWT end-marker position (of last inserted text)
      // and the length of the first text (counter l):
      ulong i = 0;
      ulong l = 1;
-    while (bwt[i] != '\0')
+
+    uchar c  = alphabetrank->access(i);
+    while (c != '\0')
      {
-        uchar c = bwt[i];
          i = C[c]+alphabetrank->rank(c, i)-1;
          ++l;
+        c = alphabetrank->access(i);
      }
      bwtEndPos = i;
+    assert(bwtEndPos < n);
      //printf("bwtEndPos = %lu\n", bwtEndPos);
  
-    delete [] bwt;
-
      // Build up arrays for text length and starting positions
+    // FIXME Remove, temp data
      textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
      textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
      (*textLength)[0] = l;
@@ -228,43 +269,71 @@ void CSA::MakeStatic()
  
  bool CSA::EmptyText(DocId k) const
  {
-    assert(k < (DocId)numberOfTexts);
-    return (1 == (*textLength)[k]);
+    assert(k < (DocId)numberOfTexts); 
+    if (emptyTextRank->IsBitSet(k))
+        return true;
+    return false;
  }
  
  uchar* CSA::GetText(DocId k) const
  {
      assert(k < (DocId)numberOfTexts);
  
-    uchar* result = new uchar[(*textLength)[k]];
+    ulong textRank = 0;
+    if (emptyTextRank->IsBitSet(k, &textRank))
+    {
+        uchar* result = new uchar[1];
+        result[0] = 0;
+        return result;
+    }
+    // Map to non-empty text
+    k -= textRank; //emptyTextRank->rank(k);
+    
      TextPosition i = k;
-    TextPosition j = (*textLength)[k] - 1;
-    result[j] = '\0';
+    // FIXME
+    string result; //TextPosition j = (*textLength)[k] - 1;
  
-    uchar c = alphabetrank->charAtPos(i);
+    uchar c = alphabetrank->access(i);
      while (c != '\0')
      {
-        --j;
-        result[j] = c;
+        result.push_back(c);
          i = C[c]+alphabetrank->rank(c,i)-1;
          
-        c = alphabetrank->charAtPos(i); // "next" char.
+        c = alphabetrank->access(i); // "next" char.
      }
-    assert(j == 0);
-    return result;
+    
+    // Convert to uchar (FIXME return string?)
+    i = result.size();
+    uchar* res = new uchar[i+1]; 
+    res[i] = '\0';   
+    for (ulong j = 0; j < i; ++j)
+        res[i-j-1] = result[j];
+    return res;
  }
-
+/*
+ * Not supported
  uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
  {
      assert(k < (DocId)numberOfTexts);
      assert(j < (*textLength)[k]);
      assert(i <= j);
  
+    ulong textRank = 0;
+    if (emptyTextRank->IsBitSet(k, &textRank))
+    {
+        uchar* result = new uchar[1];
+        result[0] = 0;
+        return result;
+    }
+    
+    // Map to non-empty text
+    k -= textRank; // emptyTextRank->rank(k);
+
      // Start position of k'th text
      ulong start = (*textStartPos)[k];
  
      return Substring(i + start, j-i+1);
-}
+    }*/
  
  /******************************************************************
   * Existential queries
@@ -286,7 +355,7 @@ bool CSA::IsPrefix(uchar const * pattern) const
  
  bool CSA::IsSuffix(uchar const *pattern) const
  {
-    // Here counting is as fast as isSuffix():
+    // Here counting is as fast as IsSuffix():
      if (CountSuffix(pattern) > 0)
          return true;
      return false;
@@ -295,8 +364,12 @@ bool CSA::IsSuffix(uchar const *pattern) const
  bool CSA::IsEqual(uchar const *pattern) const
  {
      TextPosition m = std::strlen((char *)pattern);
-    //if (m == 0)
-    //    return false; // FIXME Check for empty texts! 
+    if (m == 0)
+    {
+        if (numberOfAllTexts - numberOfTexts > 0u)
+            return true; // Empty texts exists
+        return false; // No empty texts exists
+    }
  
      TextPosition sp = 0, ep = 0;
      // Match including end-marker
@@ -337,7 +410,7 @@ unsigned CSA::CountPrefix(uchar const * pattern) const
  {
      TextPosition m = strlen((char *)pattern);
      if (m == 0)
-        return numberOfTexts;
+        return numberOfAllTexts;
  
      TextPosition sp = 0, ep = 0;
      Search(pattern, m, &sp, &ep);
@@ -350,7 +423,7 @@ unsigned CSA::CountSuffix(uchar const * pattern) const
  {
      TextPosition m = strlen((char *)pattern);
      if (m == 0)
-        return numberOfTexts;
+        return numberOfAllTexts;
  
      TextPosition sp = 0, ep = 0;
      // Search with end-marker
@@ -362,8 +435,8 @@ unsigned CSA::CountSuffix(uchar const * pattern) const
  unsigned CSA::CountEqual(uchar const *pattern) const
  {
      TextPosition m = strlen((char const *)pattern);
-//    if (m == 0)
-//        return ; // FIXME Test for empty texts. 
+    if (m == 0)
+        return numberOfAllTexts - numberOfTexts; // Empty texts. 
  
      TextPosition sp = 0, ep = 0;
      // Match including end-marker
@@ -375,6 +448,10 @@ unsigned CSA::CountEqual(uchar const *pattern) const
  
  unsigned CSA::CountContains(uchar const * pattern) const
  {
+    TextPosition m = strlen((char const *)pattern);
+    if (m == 0)
+        return numberOfAllTexts; // Total number of texts. 
+
      // Here counting is as slow as fetching the result set
      // because we would have to filter out occ's that fall within same document.
      TextCollection::document_result result = Contains(pattern);
@@ -395,7 +472,7 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const
  {
      TextPosition m = strlen((char *)pattern);
      if (m == 0)
-        return TextCollection::document_result();
+        return TextCollection::document_result(); // FIXME Should return all 1...k
  
      TextPosition sp = 0, ep = 0;
      Search(pattern, m, &sp, &ep);
@@ -407,6 +484,9 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const
      if (resultSize == 0)
          return result;
  
+    result.reserve(resultSize); // Try to avoid reallocation.
+
+    // Iterate through end-markers in [sp,ep]:
      unsigned i = 0;
      if (sp > 0)
          i = alphabetrank->rank(0, sp - 1);
@@ -414,7 +494,10 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const
      {
          // End-marker we found belongs to the "preceeding" doc in the collection
          DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+        // Map to doc ID:
+        docId = emptyTextRank->select0(docId+1);
          result.push_back(docId);
+
          -- resultSize;
          ++ i;
      }
@@ -426,37 +509,46 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const
  {
      TextPosition m = strlen((char *)pattern);
      if (m == 0)
-        return TextCollection::document_result();
+        return TextCollection::document_result(); // FIXME Should return all 1...k
  
      TextPosition sp = 0, ep = 0;
      // Search with end-marker
      Search(pattern, m+1, &sp, &ep);
   
      TextCollection::document_result result;
+    result.reserve(ep-sp+1); // Try to avoid reallocation.
  
+    ulong sampled_rank_i = 0;
      // Check each occurrence
      for (; sp <= ep; ++sp)
      {
          TextPosition i = sp;
-        uchar c = alphabetrank->charAtPos(i);
-        while (c != '\0' && !sampled->IsBitSet(i))
+
+        uchar c = alphabetrank->access(i);
+        while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
          {
-            i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
-            c = alphabetrank->charAtPos(i);
+            i = C[c]+alphabetrank->rank(c,i)-1;
+            c = alphabetrank->access(i);
          }
+        // Assert: c == '\0'  OR  sampled->IsBitSet(i)
+
          if (c == '\0')
          {
              // Rank among the end-markers in BWT
              unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
  
              // End-marker that we found belongs to the "preceeding" doc in collection:
-            DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+            DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;        
+            // Map to doc ID:
+            docId = emptyTextRank->select0(docId+1);
              result.push_back(docId);
          }
          else // Sampled position
          {
-            TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
-            result.push_back(DocIdAtTextPos(textPos));
+            DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+            // Map to doc ID:
+            docId = emptyTextRank->select0(docId+1);
+            result.push_back(docId);
          }
      }
      
@@ -467,7 +559,7 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const
  {
      TextPosition m = strlen((char const *)pattern);
      if (m == 0)
-        return TextCollection::document_result();
+        return TextCollection::document_result(); // FIXME Should return all empty texts
  
      TextPosition sp = 0, ep = 0;
      // Match including end-marker
@@ -480,14 +572,19 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const
      if (resultSize == 0)
          return result;
  
+    result.reserve(resultSize); // Try to avoid reallocation.
+
      unsigned i = 0;
      if (sp > 0)
          i = alphabetrank->rank(0, sp - 1);
      while (resultSize)
      {
-       // End-marker we found belongs to the "previous" doc in the collection
+        // End-marker we found belongs to the "previous" doc in the collection
          DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+        // Map to doc ID:
+        docId = emptyTextRank->select0(docId+1);
          result.push_back(docId);
+
          -- resultSize;
          ++ i;
      }
@@ -505,19 +602,20 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const
      TextPosition sp = 0, ep = 0;
      // Search all occurrences
      Search(pattern, m, &sp, &ep);
- 
+
      // We want unique document indentifiers, using std::set to collect them
      std::set<DocId> resultSet;
  
+    ulong sampled_rank_i = 0;
      // Check each occurrence
      for (; sp <= ep; ++sp)
      {
          TextPosition i = sp;
-        uchar c = alphabetrank->charAtPos(i);
-        while (c != '\0' && !sampled->IsBitSet(i))
+        uchar c = alphabetrank->access(i);
+        while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
          {
              i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
-            c = alphabetrank->charAtPos(i);
+            c = alphabetrank->access(i);
          }
          if (c == '\0')
          {
@@ -530,15 +628,17 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const
          }
          else
          {
-            TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
-            resultSet.insert(DocIdAtTextPos(textPos));
+            DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+            assert((unsigned)di < numberOfTexts);
+            resultSet.insert(di);
          }
      }
      
-    // Convert std::set to std::vector (TODO better way to construct result vector?)
-    TextCollection::document_result result;
-    for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
-        result.push_back(*it);
+    // Convert std::set to std::vector
+    TextCollection::document_result result(resultSet.begin(), resultSet.end());
+    // Map to doc ID's
+    for (document_result::iterator it = result.begin(); it != result.end(); ++it)
+        *it = emptyTextRank->select0(*it+1);
      return result;
  }
  
@@ -556,24 +656,26 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const
  {
      TextPosition m = strlen((char *)pattern);
      if (m == 0)
-        return full_result();
+        return full_result(); // FIXME Throw exception?
  
      TextPosition sp = 0, ep = 0;
      // Search all occurrences
      Search(pattern, m, &sp, &ep);
   
      full_result result;
+    result.reserve(ep-sp+1); // Try to avoid reallocation.
  
+    ulong sampled_rank_i = 0;    
      // Report each occurrence
      for (; sp <= ep; ++sp)
      {
          TextPosition i = sp;
          TextPosition dist = 0;
-        uchar c = alphabetrank->charAtPos(i);
-        while (c != '\0' && !sampled->IsBitSet(i))
+        uchar c = alphabetrank->access(i);
+        while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
          {
-            i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
-            c = alphabetrank->charAtPos(i);
+            i = C[c]+alphabetrank->rank(c,i)-1;
+            c = alphabetrank->access(i);
              ++ dist;
          }
          if (c == '\0')
@@ -583,14 +685,19 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const
              
              // End-marker that we found belongs to the "preceeding" doc in collection:
              DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+            // Map to doc ID:
+            docId = emptyTextRank->select0(docId+1);
              result.push_back(make_pair(docId, dist)); 
          }
          else
          {
-            TextPosition textPos = (*suffixes)[sampled->rank(i)-1] + dist;
-            // Read document identifier and its starting position in text order
-            DocId docId = DocIdAtTextPos(textPos);
-            result.push_back(make_pair(docId, textPos - (*textStartPos)[docId]));
+            TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
+            DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+//            textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
+
+            // Map to doc ID:
+            docId = emptyTextRank->select0(docId+1);
+            result.push_back(make_pair(docId, textPos));
          }
      }
      
@@ -598,106 +705,59 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const
  }
  
  
-/**
- * Finds document identifier for given text position
- *
- * Starting text position of the document is stored into second parameter.
- * Binary searching on text starting positions. 
- */
-TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
-{
-    assert(i < n);
-
-    DocId a = 0;
-    DocId b = numberOfTexts - 1;
-    while (a < b)
-    {
-        DocId c = a + (b - a)/2;
-        if ((*textStartPos)[c] > i)
-            b = c - 1;
-        else if ((*textStartPos)[c+1] > i)
-            return c;
-        else
-            a = c + 1;
-    }
-
-    assert(a < (DocId)numberOfTexts);
-    assert(i > (*textStartPos)[a]);
-    assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
-    return a;
-}
-
-/**
- * Count end-markers in given interval
- */
-unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep) const
-{
-    if (sp > ep)
-        return 0;
-
-    ulong ranksp = 0;
-    if (sp != 0)
-        ranksp = alphabetrank->rank(0, sp - 1);
-    
-    return alphabetrank->rank(0, ep) - ranksp;
-}
-
  /**
   * Save index to a file handle
   *
   * Throws a std::runtime_error exception on i/o error.
   * First byte that is saved represents the version number of the save file.
- * In version 1 files, the data fields are:
- *        <uchar version> version info;
- *        <unsigned s>    samplerate;
- *        <ulong n>       length of the BWT;
- *        <ulong bwt>     end marker position in BWT;
- *        <uchar *>       BWT string of length n;
- *        <unsigned r>    number of texts;
- *        <ulong max>     length of longest text;
- *        <vector textLength>
- *                        array of <ulong, ulong> pairs.
- * 
- * TODO: Save the data structures instead of BWT sequence?
- * TODO: Don't save textLength and textStartPos arrays.
+ * In version 2 files, the data fields are:
+ *  uchar versionFlag;
+    TextPosition n;
+    unsigned samplerate;
+    unsigned C[256];
+    TextPosition bwtEndPos;
+    static_sequence * alphabetrank;
+    BSGAP *sampled; 
+    BlockArray *suffixes;
+    BlockArray *suffixDocId;
+    unsigned numberOfTexts;
+    unsigned numberOfAllTexts;
+    ulong maxTextLength;
+    BlockArray *endmarkerDocId;
+    BSGAP *emptyTextRank;
   */
  void CSA::Save(FILE *file) const
  {
-    // Saving version 1 data:
-    uchar versionFlag = 1;
+    // Saving version info:
      if (std::fwrite(&versionFlag, 1, 1, file) != 1)
          throw std::runtime_error("CSA::Save(): file write error (version flag).");
  
+    if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
+        throw std::runtime_error("CSA::Save(): file write error (n).");
      if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
          throw std::runtime_error("CSA::Save(): file write error (samplerate).");
  
-    if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
-        throw std::runtime_error("CSA::Save(): file write error (n).");
+    for(ulong i = 0; i < 256; ++i)
+        if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
+            throw std::runtime_error("CSA::Save(): file write error (C table).");
  
      if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
          throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
-    
-    for (ulong offset = 0; offset < n; offset ++)
-    {
-        uchar c = alphabetrank->charAtPos(offset);
-        if (std::fwrite(&c, sizeof(uchar), 1, file) != 1)
-            throw std::runtime_error("CSA::Save(): file write error (bwt sequence).");
-    }
  
-    unsigned r = numberOfTexts;
-    if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1)
-        throw std::runtime_error("CSA::Save(): file write error (r).");
+    alphabetrank->save(file);
+    sampled->Save(file);
+    suffixes->Save(file);
+    suffixDocId->Save(file);
  
+    if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
+        throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
+    if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
+        throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
      if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
          throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
-    
-    for (r = 0; r < numberOfTexts; ++ r)
-    {
-        if (std::fwrite(&((*textLength)[r]), sizeof(TextPosition), 1, file) != 1)
-            throw std::runtime_error("CSA::Save(): file write error (text length).");
-        if (std::fwrite(&((*textStartPos)[r]), sizeof(TextPosition), 1, file) != 1)
-            throw std::runtime_error("CSA::Save(): file write error (text start).");
-    }    
+
+    endmarkerDocId->Save(file);
+    emptyTextRank->Save(file);
  }
  
  
@@ -713,70 +773,61 @@ void CSA::Load(FILE *file, unsigned samplerate)
      delete dynFMI;       dynFMI = 0;
      delete alphabetrank; alphabetrank = 0;
      delete sampled;      sampled = 0;
-    delete [] suffixes;  suffixes = 0;
-    delete [] positions; positions = 0;
+    delete suffixes;     suffixes = 0;
+    delete suffixDocId;  suffixDocId = 0;
+    delete positions; positions = 0;
      delete [] codetable; codetable = 0;
  
      delete endmarkerDocId; endmarkerDocId = 0;
+    delete emptyTextRank;  emptyTextRank = 0;
+    // FIXME Remove following:
      delete textLength;     textLength = 0;
      delete textStartPos;   textStartPos = 0;
  
      this->maxTextLength = 0;
      this->numberOfTexts = 0;
+    this->numberOfAllTexts = 0;
      this->samplerate = samplerate;
      this->n = 0;
  
-    uchar versionFlag = 0;
-    if (std::fread(&versionFlag, 1, 1, file) != 1)
+    uchar verFlag = 0;
+    if (std::fread(&verFlag, 1, 1, file) != 1)
          throw std::runtime_error("CSA::Load(): file read error (version flag).");
-    if (versionFlag != 1)
-        throw std::runtime_error("CSA::Load(): invalid start byte.");
+    if (verFlag != CSA::versionFlag)
+        throw std::runtime_error("CSA::Load(): invalid save file version.");
  
+    if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
+        throw std::runtime_error("CSA::Load(): file read error (n).");
      if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
          throw std::runtime_error("CSA::Load(): file read error (samplerate).");
-    if (this->samplerate == 0)
-        this->samplerate = samplerate;
+// FIXME samplerate can not be changed during load.
+//    if (this->samplerate == 0)
+//        this->samplerate = samplerate;
  
-    if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
-        throw std::runtime_error("CSA::Load(): file read error (n).");
+    for(ulong i = 0; i < 256; ++i)
+        if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
+            throw std::runtime_error("CSA::Load(): file read error (C table).");
  
      if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
          throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
  
-    uchar *bwt = new uchar[n];
-    for (ulong offset = 0; offset < n; offset ++)
-        if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1)
-            throw std::runtime_error("CSA::Load(): file read error (bwt sequence).");
-
-    unsigned r = 0;
-    if (std::fread(&r, sizeof(unsigned), 1, file) != 1)
-        throw std::runtime_error("CSA::Load(): file read error (r).");
+    alphabetrank = static_sequence::load(file);
+    sampled = new BSGAP(file);
+    suffixes = new BlockArray(file);
+    suffixDocId = new BlockArray(file);
  
+    if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
+        throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
+    if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
+        throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
      if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
          throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
-    
-    // Build up arrays for text length and starting positions
-    textLength = new BlockArray(r, Tools::CeilLog2(maxTextLength));
-    textStartPos = new BlockArray(r, Tools::CeilLog2(this->n));
  
-    while (r > 0)
-    {
-        TextPosition length = 0, start = 0;
-        if (std::fread(&length, sizeof(TextPosition), 1, file) != 1)
-            throw std::runtime_error("CSA::Load(): file read error (text length).");
-        if (std::fread(&start, sizeof(TextPosition), 1, file) != 1)
-            throw std::runtime_error("CSA::Load(): file read error (text start).");
-
-        (*textLength)[numberOfTexts] = length;
-        (*textStartPos)[numberOfTexts] = start;
-        ++numberOfTexts;
-        --r;
-    }
+    endmarkerDocId = new BlockArray(file);
+    emptyTextRank = new BSGAP(file);
  
-    // Construct data structures
-    makewavelet(bwt);
-    delete [] bwt;
-    maketables();  // FIXME: this will redo text length tables
+    // FIXME Construct data structures with new samplerate
+    //maketables();  // FIXME: this will redo text length tables
  }
  
  
@@ -785,6 +836,8 @@ void CSA::Load(FILE *file, unsigned samplerate)
   * Rest of the functions follow...
   */
  
+/*
+ * Not supported
  uchar * CSA::Substring(TextPosition i, TextPosition l) const
  {
      uchar *result = new uchar[l + 1];
@@ -818,7 +871,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const
      
      for (dist = 0; dist < skip + l; dist++) 
      {
-        int c = alphabetrank->charAtPos(j);
+        int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp);
          if (c == '\0')
          {
              // Rank among the end-markers in BWT
@@ -833,7 +886,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const
      result[l] = 0u;
      return result;
  }
-
+*/
    /*TextPosition CSA::inverse(TextPosition i)
  {
      TextPosition skip = samplerate - i % samplerate;
@@ -858,7 +911,8 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const
      return j;
      }*/
   
-ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
+ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const 
+{
      // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
      int c = (int)pattern[m-1]; 
      TextPosition i=m-1;
@@ -877,7 +931,7 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult,
          return ep - sp + 1;
      else
          return 0;
- }
+}
  
     /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep) 
  {
@@ -890,12 +944,13 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult,
          return 0;
          }*/
  
-CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
+CSA::TextPosition CSA::Lookup(TextPosition i) const
  {
      TextPosition dist=0;
      while (!sampled->IsBitSet(i)) 
      {
-        int c = alphabetrank->charAtPos(i);
+        
+        int c = alphabetrank->access(i);
          if (c == '\0')
          {
              // End-markers are sampled
@@ -903,7 +958,7 @@ CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(sample
              DocId docId = (*endmarkerDocId)[endmarkerRank];
              return (*textStartPos)[docId] + dist;
          }
-        i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping  
+        i = C[c]+alphabetrank->rank(c,i)-1;  
          ++dist;
      }
      
@@ -915,12 +970,14 @@ CSA::~CSA() {
      delete alphabetrank;       
      delete sampled;
      delete suffixes;
+    delete suffixDocId;
      delete positions;
      delete [] codetable;
  
      delete endmarkerDocId;
      delete textLength;
      delete textStartPos;
+    delete emptyTextRank;
  }
  
  void CSA::makewavelet(uchar *bwt)
@@ -948,9 +1005,30 @@ void CSA::makewavelet(uchar *bwt)
          C[i]=C[i-1]+prev;
          prev = temp;
      }
-    this->codetable = node::makecodetable(bwt,n);
-    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);   
-    //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");    
+//    this->codetable = node::makecodetable(bwt,n);
+//    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);   
+//    delete [] bwt;
+    //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
+
+    uint *text = new uint[n];
+    for (i = 0; i < n; ++i) // Silly
+        text[i] = bwt[i];
+    delete [] bwt;
+    bwt = 0;
+
+    alphabet_mapper * am = new alphabet_mapper_none();
+    static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(16); // FIXME samplerate?
+    
+    //cout << "Building Huffman table..."; cout.flush();
+    
+    wt_coder * wtc = new wt_coder_huff(text,n,am);
+    
+    //cout << "done" << endl; cout.flush();
+    //cout << "Building static_sequence..."; cout.flush();
+    
+    alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am);
+    delete [] text;
+    text = 0;
  
  /*    for (i = 0; i < n; ++i)
      {
@@ -966,9 +1044,9 @@ void CSA::maketables()
     
      ulong *sampledpositions = new ulong[n/W+1];
      unsigned ceilLog2n = Tools::CeilLog2(n);
-    suffixes = new BlockArray(sampleLength, ceilLog2n);
      positions = new BlockArray(sampleLength, ceilLog2n);
- 
+    BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
+
      // Mapping from end-markers to doc ID's:
      endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
    
@@ -977,6 +1055,7 @@ void CSA::maketables()
          sampledpositions[i]=0lu;
      
      ulong x,p=bwtEndPos;
+    ulong sampleCount = 0;
      // Keeping track of text position of end-markers seen
      ulong posOfSuccEndmarker = n;
      DocId textId = numberOfTexts;
@@ -988,12 +1067,14 @@ void CSA::maketables()
        // i substitutes SA->GetPos(i)
          x=(i==n-1)?0:i+1;
  
-        if (x % samplerate == 0) {
+        if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
              Tools::SetField(sampledpositions,1,p,1);
-            (*positions)[x/samplerate] = p;
+            (*positions)[sampleCount] = p; 
+            (*tmpSuffix)[sampleCount] = x; // FIXME remove
+            sampleCount ++;
          }
  
-        uchar c = alphabetrank->charAtPos(p);
+        uchar c = alphabetrank->access(p);
          if (c == '\0')
          {
              --textId;
@@ -1033,14 +1114,66 @@ void CSA::maketables()
          assert(dynTextLength[i].second == (*textStartPos)[i]);
      }
  */
-    sampled = new BitRank(sampledpositions,n,true);   
-     
+    sampled = new BSGAP(sampledpositions,n,true);
+    sampleLength = sampled->rank(n-1);
+    assert(sampleCount == sampleLength);
+//    std::cout << ";sampleLength;" << sampleLength << std::endl;
+    // Suffixes == offset from text start position
+    suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
+    suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
+
     //suffixes:   
      for(i=0; i<sampleLength; i++) {
+        assert((*positions)[i] < n);
          j = sampled->rank((*positions)[i]);
          if (j==0) j=sampleLength;
-        (*suffixes)[j-1] = (i*samplerate==n)?0:i*samplerate;
-    }   
+        TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate;
+        (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]);
+
+        assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts);
+        assert((*suffixDocId)[j-1] < numberOfTexts);
+        // calculate offset from text start:
+        (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
+    }
+    // FIXME Temp, remove
+    delete tmpSuffix;
+    tmpSuffix=0;
+    delete positions;
+    positions =0;
+    delete textLength;
+    textLength = 0;
+    delete textStartPos;
+    textStartPos = 0;
+}
+
+
+/**
+ * Finds document identifier for given text position
+ *
+ * Starting text position of the document is stored into second parameter.
+ * Binary searching on text starting positions. 
+ */
+TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
+{
+    assert(i < n);
+
+    DocId a = 0;
+    DocId b = numberOfTexts - 1;
+    while (a < b)
+    {
+        DocId c = a + (b - a)/2;
+        if ((*textStartPos)[c] > i)
+            b = c - 1;
+        else if ((*textStartPos)[c+1] > i)
+            return c;
+        else
+            a = c + 1;
+    }
+
+    assert(a < (DocId)numberOfTexts);
+    assert(i >= (*textStartPos)[a]);
+    assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
+    return a;
  }
  
  CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
diff --git a/CSA.h b/CSA.h

index bd7b199..b9a1745 100644 (file)
--- a/CSA.h
+++ b/CSA.h
@@ -20,13 +20,20 @@
  
  #ifndef _CSA_H_
  #define _CSA_H_
+#include "dynFMI.h"
+
+// Include  from XMLTree/libcds
+#include <basics.h>
+#include <static_bitsequence.h>
+#include <alphabet_mapper.h>
+#include <static_sequence.h>
  
-#include <map>
+#include <set>
  #include <vector>
  #include "BitRank.h"
-#include "dynFMI.h"
  #include "TextCollection.h"
  #include "BlockArray.h"
+#include "RLWaveletTree.h"
  
  /**
   * Implementation of the TextCollection interface
@@ -58,7 +65,7 @@ public:
      void MakeStatic();
      bool EmptyText(DocId) const;
      uchar* GetText(DocId) const;
-    uchar* GetText(DocId, TextPosition, TextPosition) const;
+//    uchar* GetText(DocId, TextPosition, TextPosition) const;
  
      bool IsPrefix(uchar const *) const;
      bool IsSuffix(uchar const *) const;
@@ -86,8 +93,31 @@ public:
      void Load(FILE *, unsigned);
      void Save(FILE *) const;
  
-    // debug
-    TextPosition Lookup(TextPosition) const;
+
+    // Debug FIXME Remove
+    void deleteWT()
+    {
+        delete alphabetrank;
+        alphabetrank = 0;
+        delete [] codetable;
+        codetable = 0;
+    }
+    void deleteSamples()
+    {
+        delete sampled;
+        sampled =0;
+        delete suffixes;
+        suffixes = 0;
+        delete positions;
+        positions = 0;
+        delete suffixDocId;
+        suffixDocId = 0;
+    }
+    void deleteEndmarker()
+    {
+        delete endmarkerDocId;
+        endmarkerDocId = 0;
+    }
  
  private:
      class TCodeEntry {
@@ -110,7 +140,10 @@ private:
          bool leaf;
      public:
          THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
+        THuffAlphabetRank(FILE *);
          ~THuffAlphabetRank();
+        
+        void Save(FILE *);
          bool Test(uchar *, TextPosition);
          
          inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
@@ -151,7 +184,7 @@ private:
              } 
              return true;
          }
-        inline int charAtPos(TextPosition i) const {
+        inline uchar access(TextPosition i) const {
              THuffAlphabetRank const * temp=this;
              while (!temp->leaf) {
                  if (temp->bitrank->IsBitSet(i)) {
@@ -163,7 +196,22 @@ private:
                      temp = temp->left;      
              }         
              }
-            return (int)temp->ch;
+            return temp->ch;
+        }
+
+        inline uchar charAtPos(ulong i, TextPosition *rank) const {
+            THuffAlphabetRank const * temp=this;
+            while (!temp->leaf) {
+                if (temp->bitrank->IsBitSet(i)) {
+                    i = temp->bitrank->rank(i)-1;
+                    temp = temp->right;
+                } else {
+                    i = i-temp->bitrank->rank(i);
+                    temp = temp->left;
+                }
+            }
+            (*rank)=i;
+            return temp->ch;
          }
      };
  
@@ -202,23 +250,28 @@ private:
      
      static const unsigned char print = 1;
      static const unsigned char report = 1;
+    static const uchar versionFlag;
      TextPosition n;
      unsigned samplerate;
      unsigned C[256];
      TextPosition bwtEndPos;
-    THuffAlphabetRank *alphabetrank;
-    BitRank *sampled; 
+//    THuffAlphabetRank *alphabetrank;
+    //    RLWaveletTree *alphabetrank;
+    static_sequence * alphabetrank;
+    BSGAP *sampled; 
      BlockArray *suffixes;
+    BlockArray *suffixDocId;
      BlockArray *positions;
      TCodeEntry *codetable;
      DynFMI *dynFMI;
  
      // Total number of texts in the collection
      unsigned numberOfTexts;
+    // Total number of texts including empty texts
+    unsigned numberOfAllTexts;
      // Length of the longest text
      ulong maxTextLength;
-    
-    
+
      // Array of document id's in the order of end-markers in BWT
      // Access by endmarkerDocId[rank_$(L, p) - 1].
      BlockArray *endmarkerDocId;
@@ -226,7 +279,11 @@ private:
      BlockArray *textLength;
      // Array of text starting positions (in the inserted order)
      BlockArray *textStartPos;
- 
+
+    // FIXME Replace with a more succinct data structure
+    std::set<unsigned> emptyTextId;
+    BSGAP *emptyTextRank;
+
      // Private methods
      uchar * BWT(uchar *);
      uchar * LoadFromFile(const char *);
@@ -236,12 +293,27 @@ private:
  
      // Following are not part of the public API
      DocId DocIdAtTextPos(TextPosition) const;
-    unsigned CountEndmarkers(TextPosition, TextPosition) const;
      ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
      TextPosition Inverse(TextPosition) const;
      TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
      TextPosition Psi(TextPosition) const;
      uchar * Substring(TextPosition, TextPosition) const;
+    TextPosition Lookup(TextPosition) const;
+
+    /**
+     * Count end-markers in given interval
+     */
+    inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
+    {
+        if (sp > ep)
+            return 0;
+
+        ulong ranksp = 0;
+        if (sp != 0)
+            ranksp = alphabetrank->rank(0, sp - 1);
+    
+        return alphabetrank->rank(0, ep) - ranksp;
+    }
  };
  
  #endif
diff --git a/GapEncode.cpp b/GapEncode.cpp

new file mode 100644 (file)

index 0000000..32102ac
--- /dev/null
+++ b/GapEncode.cpp
@@ -0,0 +1,151 @@
+/**
+ * Gap encoding scheme
+ * Niko Välimäki
+ *
+ */
+
+#include "GapEncode.h"
+#include <stdexcept>
+using namespace std;
+
+
+GapEncode::GapEncode(ulong *B, ulong u, bool freeB = false)
+{
+    ulong i, lastIndex, *bitRun, *runLength;
+    this->u = u;
+    this->numberOfRuns = 0;
+    bitRun = new ulong[u/W + 1];
+    for (i = 0; i < u/W + 1; i ++)
+        bitRun[i]  = 0;
+
+    // Collect the runs of 1- or 0-bits
+    totalLength = 0;
+    uchar curBit = 0xFF;
+    lastIndex = 0;
+    for (i = 0; i < u; i ++)
+        if (Tools::GetField(B, 1, i) != curBit)
+        {
+            if (curBit == 1)
+                totalLength += i - lastIndex;
+            else if (Tools::GetField(B, 1, i))
+                Tools::SetField(bitRun, 1, i, 1);
+
+            lastIndex = i;
+            curBit = Tools::GetField(B, 1, i);
+        }
+    // Handle the last run
+    if (curBit == 1)
+        totalLength += i - lastIndex;
+    
+    runLength = new ulong[totalLength/W + 1];
+    for (i = 0; i < totalLength/W + 1; i ++)
+        runLength[i] = 0;
+    
+    // Calculate run lengths
+    totalLength = 0;
+    curBit = 0xFF;
+    lastIndex = 0;
+    for (i = 0; i < u; i ++)
+        if (Tools::GetField(B, 1, i) != curBit)
+        {
+            if (curBit == 1)
+            {
+                totalLength += i - lastIndex;
+                Tools::SetField(runLength, 1, totalLength - 1, 1);
+            }
+            lastIndex = i;
+            curBit = Tools::GetField(B, 1, i);
+        }
+    // Handle the last run
+    if (curBit == 1)
+    {
+        totalLength += i - lastIndex;
+        Tools::SetField(runLength, 1, totalLength - 1, 1);
+    }
+    
+    // Construct BSGAP structures
+    this->bitRun = new BSGAP(bitRun, u, true, 0);
+    numberOfRuns = this->bitRun->rank(u-1);
+    this->runLength = new BSGAP(runLength, totalLength, true, 0);
+    
+    if (freeB)
+        delete [] B;
+}
+
+GapEncode::~GapEncode()
+{
+    // Free BSGAP structures
+    delete bitRun;
+    delete runLength;
+}
+
+ulong GapEncode::rank(ulong i)
+{
+    ulong pred, r = bitRun->rank(i);
+    if (r == 0)
+        return 0;
+    pred = bitRun->select(r);
+    
+    ulong bitsBeforeRun = 0;
+    if (r > 1)
+        bitsBeforeRun = runLength->select(r - 1) + 1;
+    
+    ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun;
+    if (i - pred >= bitsInRun)
+        return bitsBeforeRun + bitsInRun;
+    
+    return bitsBeforeRun + i - pred + 1;
+}
+
+ulong GapEncode::rank0(ulong i)
+{
+    return i - rank(i) + 1;
+}
+
+ulong GapEncode::select(ulong i)
+{
+    throw logic_error("GapEncode::select() is not implemented");
+    return 0;
+}
+
+ulong GapEncode::select0(ulong i)
+{
+    throw logic_error("GapEncode::select0() is not implemented");
+    return 0; // To be implemented...
+}
+
+/**
+ * Saving data fields:
+    ulong u;                // Universe size, number of bits in B
+    ulong numberOfRuns;     // Number of 1-bit runs
+    BSGAP *bitRun,          // Contains 1-bit at the start of each 1-bit-run
+          *runLength;       // Contains run lengths for each 1-bit-run
+    ulong totalLength;      // Total length of runs
+*/
+void GapEncode::Save(FILE *file) const
+{
+    if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("GapEncode::Save(): file write error (u).");
+    if (std::fwrite(&(this->numberOfRuns), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("GapEncode::Save(): file write error (numberOfRuns).");
+
+    bitRun->Save(file);
+    runLength->Save(file);
+
+    if (std::fwrite(&(this->totalLength), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("GapEncode::Save(): file write error (totalLength).");
+}
+
+GapEncode::GapEncode(FILE *file)
+{
+    if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("GapEncode::Load(): file read error (u).");
+    if (std::fread(&(this->numberOfRuns), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("GapEncode::Load(): file read error (numberOfRuns).");
+
+    bitRun = new BSGAP(file);
+    runLength = new BSGAP(file);
+
+    if (std::fread(&(this->totalLength), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("GapEncode::Load(): file read error (totalLength).");
+}
diff --git a/GapEncode.h b/GapEncode.h

new file mode 100644 (file)

index 0000000..a73e595
--- /dev/null
+++ b/GapEncode.h
@@ -0,0 +1,113 @@
+/**
+ * Gap encoding scheme
+ * Niko Välimäki
+ *
+ *
+ */
+
+#ifndef _GapEncode_H_
+#define _GapEncode_H_
+
+#include "Tools.h"
+#include "BlockArray.h"
+#include "BSGAP.h"
+
+class GapEncode
+{
+public:
+    GapEncode(ulong *, ulong, bool);
+    GapEncode(FILE *);
+    ~GapEncode();
+    
+    void Save(FILE *) const;
+    ulong rank(ulong);
+    ulong rankrun(ulong i) {return bitRun->rank(i);};
+    ulong selectrun(ulong i) {return bitRun->select(i);};
+    ulong rank0(ulong);
+    ulong select(ulong);
+    ulong select0(ulong); // Not implemented, yet...
+    bool IsBitSet(ulong i) {
+        ulong r = bitRun->rank(i);
+        if (r == 0)
+            return false;
+        ulong pred = bitRun->select(r);
+
+        ulong bitsBeforeRun = 0;
+        if (r > 1)
+            bitsBeforeRun = runLength->select(r - 1) + 1;
+
+        ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun;
+        if (i - pred >= bitsInRun)
+            return false;
+        
+        return true;
+    }
+
+    /**
+     * Returns also the rank_1 of given position i
+     */
+    bool IsBitSet(ulong i, ulong *rank) {
+        *rank = 0;
+        ulong r = bitRun->rank(i);
+        if (r == 0)
+            return false;
+        ulong pred = bitRun->select(r);
+
+        ulong bitsBeforeRun = 0;
+        if (r > 1)
+            bitsBeforeRun = runLength->select(r - 1) + 1;
+        
+        ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun;
+        if (i - pred >= bitsInRun)
+        {
+            *rank = bitsBeforeRun + bitsInRun;
+            return false;
+        }
+        
+        *rank = bitsBeforeRun + i - pred + 1;
+        return true;
+    }
+
+    /**
+     * Returns also the rank_1 of given position i
+     * and the length of the trailing 0/1-run.
+     */
+    bool IsBitSet(ulong i, ulong *rank, ulong *runl) {
+        *rank = 0;
+        ulong r = bitRun->rank(i);
+        if (r == 0)
+        {
+            // Length of 0-bit run after pos i
+            *runl = bitRun->select(1) - i - 1;
+            return false;
+        }
+        ulong pred = bitRun->select(r);
+
+        ulong bitsBeforeRun = 0;
+        if (r > 1)
+            bitsBeforeRun = runLength->select(r - 1) + 1;
+        
+        ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun;
+        if (i - pred >= bitsInRun)
+        {
+            *rank = bitsBeforeRun + bitsInRun;
+            *runl = u - i - 1;
+            if (r < numberOfRuns)
+                *runl = bitRun->select(r+1) - i - 1;
+            return false;
+        }
+                
+        *rank = bitsBeforeRun + i - pred + 1;
+        *runl = bitsInRun - (i - pred) - 1;
+        return true;
+    }
+
+private:
+    ulong u;                // Universe size, number of bits in B
+    ulong numberOfRuns;     // Number of 1-bit runs
+    BSGAP *bitRun,          // Contains 1-bit at the start of each 1-bit-run
+          *runLength;       // Contains run lengths for each 1-bit-run
+    ulong totalLength;      // Total length of runs
+};
+
+#endif
diff --git a/RLWaveletTree.cpp b/RLWaveletTree.cpp

new file mode 100644 (file)

index 0000000..dd043cb
--- /dev/null
+++ b/RLWaveletTree.cpp
@@ -0,0 +1,383 @@
+/***************************************************************************
+ *   Copyright (C) 2007 by Veli Mäkinen                                    *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.         *
+ ***************************************************************************/
+ 
+#include "RLWaveletTree.h"
+using namespace std;
+
+void RLWaveletTree::remapAlphabet(uchar *text, ulong n)
+{
+    ulong *chars = new ulong[256/W+1];
+    ulong i;
+    for (i = 0; i < 256/W+1; ++i)
+        chars[i] = 0;
+    // Set up flags:
+    for (i = 0; i < n; ++i)
+        Tools::SetField(chars, 1, text[i], 1);
+    charMap = new BitRank(chars, 256, true);
+
+    // Remap text
+    for (i = 0; i < n; ++i)
+        text[i] = charMap->rank(text[i])-1;
+}
+
+RLWaveletTree::RLWaveletTree(uchar *D, ulong n) {
+    remapAlphabet(D, n);
+//    std::cout << "Number of chars: " << charMap->rank(255) << ", maxlevel = " << Tools::CeilLog2(charMap->rank(255)) << std::endl;
+
+    this->n = n;
+    this->maxlevel = Tools::CeilLog2(charMap->rank(255));
+
+    ulong i, k, value;
+    ulong** bucket = new ulong*[maxlevel+1];
+    ulong *wtree = new ulong[n*maxlevel/(8*sizeof(ulong))+1];
+    for (i = 0; i < n*maxlevel/(8*sizeof(ulong))+1; i ++)
+        wtree[i] = 0lu;
+
+   // allocate memory for storing the locations of boudaries 
+    for (k=0; k<=maxlevel; k++) {
+        bucket[k] = (ulong *)new ulong[(ulong)1 << k];
+        for (i=0; i<((ulong)1 << k); i++)
+            bucket[k][i]=0;
+    }
+
+   // compute the sizes of buckets
+    for (i=0; i<n; i++) {
+        value = D[i];
+        for (k=0; k<=maxlevel; k++)
+            bucket[k][value >> (maxlevel-k)]++;
+    }
+
+   // cumulate the bucket sizes into boundary locations 
+    for (k=0; k<=maxlevel; k++) {
+        for (i=1; i<((ulong)1 << k); i++)
+            bucket[k][i]+=bucket[k][i-1];
+        for (i=((ulong)1 << k)-1; i>=1; i--)
+            bucket[k][i]=bucket[k][i-1];
+        bucket[k][0]=0;
+    }
+
+   //store the leaf information
+    leaves = new ulong[((ulong)1 << maxlevel)+1];
+    for (k=0; k<(ulong)1<<maxlevel; k++)
+        leaves[k] = bucket[maxlevel][k];
+    leaves[k]=n;
+
+   // compute wtree using the information about boundaries 
+    for (i=0; i<n; i++) {
+        value = D[i];
+        for (k=0; k<maxlevel; k++)
+            Tools::SetField(wtree,1,k*n+(bucket[k][value >> (maxlevel-k)])++,(value>>(maxlevel-k-1)) & (ulong)1);
+    }
+    delete [] D;
+    // deallocate boundaries
+    for (k=0; k<=maxlevel; k++)
+        delete[] bucket[k];
+    delete[] bucket;
+   
+    bitrank = new GapEncode(wtree, n*maxlevel, true);
+    prevRunLen = 0;
+    prevChar = 0;
+    prevPos = 0;
+}
+
+RLWaveletTree::~RLWaveletTree() {
+    delete charMap;
+    delete [] leaves;
+    delete bitrank;
+}
+
+ulong RLWaveletTree::rank(ulong c, ulong i) {
+//    if (i >= n) FIXME Replace with assert?
+//        std::cout << "RLWT::rank parameter i out of range" << std::endl;
+   ulong k;
+   ulong j=i;
+   ulong left = 0, right = (ulong)1 << maxlevel;
+   ulong before=0;
+   c = charMap->rank(c)-1;
+   
+   // first round separately to avoid extra "if"  
+   if ((c >> (maxlevel-1)) & (ulong)1) { // go to the right
+      j = bitrank->rank(j)-1;
+      left = right>>1; 
+   }
+   else { // go to the left
+      j -= bitrank->rank(j);
+      right = right>>1;
+   }
+   // then the next without the extra if
+   for (k=1;k<maxlevel;k++) {
+      before = bitrank->rank(n*k+leaves[left]-1);
+      if ((c >> (maxlevel-k-1)) & (ulong)1) { // go to the right
+         j = bitrank->rank(n*k+leaves[left]+j)-before-1;
+         left = (right+left)>>1; 
+      }
+      else { // go to the left
+         j -= bitrank->rank(n*k+leaves[left]+j)-before;
+        right = (right+left)>>1;
+      }
+      // can never happen: if (left > right) return 0;
+   }
+   return j+1;   
+}
+
+ulong RLWaveletTree::select(ulong c, ulong j) {
+    c = charMap->rank(c)-1;
+   ulong left=c, right=c+1;
+   ulong k, i=j-1, before;
+   for (k=maxlevel-1; k>0;--k) {
+       if ((c >> (maxlevel-k-1)) & (ulong)1) { // right side of parent
+          left -= right-left;
+         before = bitrank->rank(n*k+leaves[left]-1);
+         i = bitrank->select(before+i+1)-n*k-leaves[left];
+       }
+       else { // left side of parent
+          right +=right-left;
+          before = n*k+leaves[left]-bitrank->rank(n*k+leaves[left]-1);
+         i = bitrank->select0(before+i+1)-n*k-leaves[left];
+       }                         
+   }
+   // last level separately to avoid one extra if
+   if ((c >> (maxlevel-1)) & (ulong)1) // right side of parent
+      return bitrank->select(i+1);
+   else // left side of parent
+      return bitrank->select0(i+1);
+}
+
+bool RLWaveletTree::IsCharAtPos(ulong c, ulong i) {
+   return charAtPos(i)==c;
+}
+
+
+ulong RLWaveletTree::charAtPos(ulong i) 
+{
+//    if (i >= n) // FIXME Replace with assert?
+//        std::cout << "RLWT::charAtPos parameter i out of range" << std::endl;
+    ulong k;
+    ulong j=i;
+    ulong left = 0, right = (ulong)1 << maxlevel;
+    ulong before=0;
+    ulong c = 0;
+    ulong rank_tmp = 0;
+    // first round separately to avoid extra "if"  
+    if (bitrank->IsBitSet(j, &rank_tmp)) { // go to the right
+        c = c | ((ulong)1 << (maxlevel-1));
+        j = rank_tmp-1; //bitrank->rank(j)-1;
+        left = right>>1; 
+    }
+    else { // go to the left
+        // c = c
+        j -= rank_tmp; // bitrank->rank(j);
+        right = right>>1;
+    }
+    // then the next without the extra if
+    for (k=1;k<maxlevel;k++) {
+        before = bitrank->rank(n*k+leaves[left]-1);
+        if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
+            c = c | ((ulong)1 << (maxlevel-k-1));
+            j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
+            left = (right+left)>>1; 
+        }
+        else { // go to the left
+            // c = c
+            j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
+            right = (right+left)>>1;
+        }
+        // can never happen: if (left > right) return 0;
+    }
+    return charMap->select(c+1);         
+}
+
+/* returns also the rank-1 of char at given position i */
+ulong RLWaveletTree::charAtPos(ulong i, ulong *rank) 
+{
+//    if (i >= n) // FIXME Replace with assert?
+//        std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
+    ulong k;
+    ulong j=i;
+    ulong left = 0, right = (ulong)1 << maxlevel;
+    ulong before=0;
+    ulong c = 0;
+    ulong rank_tmp = 0;
+    // first round separately to avoid extra "if"  
+    if (bitrank->IsBitSet(j, &rank_tmp)) { // FIXME j or leaves[left]+j ?
+        c = c | ((ulong)1 << (maxlevel-1));
+        j = rank_tmp-1; //bitrank->rank(j)-1;
+        left = right>>1; 
+    }
+    else { // go to the left
+        // c = c
+        j -= rank_tmp; //bitrank->rank(j);
+        right = right>>1;
+    }
+    // then the next without the extra if
+    for (k=1;k<maxlevel;k++) {
+        before = bitrank->rank(n*k+leaves[left]-1);
+        if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
+            c = c | ((ulong)1 << (maxlevel-k-1));
+            j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
+            left = (right+left)>>1; 
+        }
+        else { // go to the left
+            // c = c
+            j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
+            right = (right+left)>>1;
+        }
+        // can never happen: if (left > right) return 0;
+    }
+    *rank = j;
+    return charMap->select(c+1);         
+}
+
+/* returns also the rank-1 of char at given position i
+ * and trailing char-run length */
+ulong RLWaveletTree::charAtPosRun(ulong i, ulong *rank) 
+{
+    // Check for buffered data from previous call
+    if (i > prevPos && i <= prevPos + prevRunLen)
+    {
+        *rank = prevRank + (i-prevPos);
+        return prevChar;
+    }
+
+    prevPos = i;
+//    if (i >= n) // FIXME Replace with assert?
+//        std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
+    ulong k;
+    ulong j=i;
+    ulong left = 0, right = (ulong)1 << maxlevel;
+    ulong before=0;
+    ulong c = 0;
+    ulong rank_tmp = 0;
+    ulong runl_tmp = 0;
+    // first round separately to avoid extra "if"  
+    if (bitrank->IsBitSet(j, &rank_tmp, &prevRunLen)) {
+        if (j+prevRunLen > n)
+            prevRunLen = n-j-1;
+        c = c | ((ulong)1 << (maxlevel-1));
+        j = rank_tmp-1; //bitrank->rank(j)-1;
+        left = right>>1; 
+    }
+    else { // go to the left
+        // c = c
+        if (j+prevRunLen > n)
+            prevRunLen = n-j-1;
+        j -= rank_tmp; //bitrank->rank(j);
+        right = right>>1;
+    }
+
+    // then the next without the extra if
+    for (k=1;k<maxlevel;k++) {
+        before = bitrank->rank(n*k+leaves[left]-1);
+        if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp, &runl_tmp)) { // go to the right
+            if (leaves[left]+j+runl_tmp > leaves[right])
+                runl_tmp = leaves[right] - (leaves[left]+j) - 1;
+            if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run?
+                prevRunLen = runl_tmp;
+            c = c | ((ulong)1 << (maxlevel-k-1));
+            j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
+            left = (right+left)>>1; 
+        }
+        else { // go to the left
+            if (leaves[left]+j+runl_tmp > leaves[right])
+                runl_tmp = leaves[right] - (leaves[left]+j) - 1;
+            if (prevRunLen > runl_tmp)  // FIXME Better way to keep track of a run?
+                prevRunLen = runl_tmp;
+            // c = c
+            j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
+            right = (right+left)>>1;
+        }
+        // can never happen: if (left > right) return 0;
+    }
+    *rank = j;
+    prevRank = j;
+    prevChar = charMap->select(c+1);
+    return charMap->select(c+1);         
+}
+
+/**
+ * Saving data fields:
+       ulong n;
+       ulong maxlevel;
+       ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1
+        GapEncode *bitrank;
+        BitRank *charMap;
+*/
+void RLWaveletTree::Save(FILE *file) const
+{
+    if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("RLWaveletTree::Save(): file write error (n).");
+    if (std::fwrite(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("RLWaveletTree::Save(): file write error (maxlevel).");
+
+    for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
+    {
+        if (std::fwrite(this->leaves + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("RLWaveletTree::Save(): file write error (leaves).");
+    }
+
+    bitrank->Save(file);
+
+    // Silly
+    ulong *chars = new ulong[256/W+1];
+    // Set up flags:
+    for (ulong i = 0; i < 256; ++i)
+        Tools::SetField(chars, 1, i, charMap->IsBitSet(i));
+
+    for (ulong offset = 0; offset < 256/W+1; ++offset)
+    {
+        if (std::fwrite(chars + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("RLWaveletTree::Save(): file write error (chars).");
+    }
+
+    delete [] chars;
+}
+
+/**
+ * Load from file
+ */
+RLWaveletTree::RLWaveletTree(FILE *file)
+{
+    if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("RLWaveletTree::Load(): file read error (n).");
+    if (std::fread(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
+        throw std::runtime_error("RLWaveletTree::Load(): file read error (maxlevel).");
+
+    for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
+    {
+        if (std::fread(this->leaves + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("RLWaveletTree::Load(): file read error (leaves).");
+    }
+
+    bitrank = new GapEncode(file);
+
+    // Silly
+    ulong *chars = new ulong[256/W+1];
+    // Read flags
+    for (ulong offset = 0; offset < 256/W+1; ++offset)
+    {
+        if (std::fread(chars + offset, sizeof(ulong), 1, file) != 1)
+            throw std::runtime_error("RLWaveletTree::Load(): file read error (chars).");
+    }
+    charMap = new BitRank(chars, 256, true);
+
+    prevRunLen = 0; // Init "buffer" values
+    prevChar = 0;
+    prevPos = 0;
+}
diff --git a/RLWaveletTree.h b/RLWaveletTree.h

new file mode 100644 (file)

index 0000000..f3ed9ed
--- /dev/null
+++ b/RLWaveletTree.h
@@ -0,0 +1,64 @@
+/***************************************************************************
+ *   Copyright (C) 2007 by Veli Mäkinen                                    *
+ *                                                                         *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.         *
+ ***************************************************************************/
+
+/*
+ * Wavelet tree by Veli Mäkinen,
+ */
+
+#ifndef _RLWAVELETTREE_H_
+#define _RLWAVELETTREE_H_
+#include "GapEncode.h"
+#include "Tools.h"
+#include "BitRank.h"
+
+class RLWaveletTree {
+private:
+       ulong n;
+       ulong maxlevel;
+       ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1
+        GapEncode *bitrank;
+        BitRank *charMap;
+        void remapAlphabet(uchar*, ulong);
+
+        ulong prevRunLen; // Used by charAtPosRun()
+        ulong prevPos;
+        ulong prevChar;
+        ulong prevRank;
+public:            
+    RLWaveletTree(uchar *, ulong);
+    RLWaveletTree(FILE *);
+    ~RLWaveletTree();
+
+    void Save(FILE *) const;
+   ulong rank(ulong c, ulong i);
+    
+   ulong select(ulong c, ulong j);
+        
+   bool IsCharAtPos(ulong c, ulong i);
+
+   ulong charAtPos(ulong i);
+   ulong charAtPos(ulong i, ulong *rank);
+   ulong charAtPosRun(ulong i, ulong *rank);
+   double spaceInBits() {
+       return 0; // Not implemented
+   }
+};
+
+#endif
diff --git a/TextCollection.h b/TextCollection.h

index 6a4eb1c..ef0eb72 100644 (file)
--- a/TextCollection.h
+++ b/TextCollection.h
@@ -105,7 +105,7 @@ namespace SXSI
           *
           * Note: Parameters i and j are text positions inside the k'th text.
           */
-        virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0;
+//        virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0;
          /**
           * Returns backwards (reverse) iterator to the end of i'th text
           * 
@@ -164,6 +164,12 @@ namespace SXSI
          typedef std::vector<std::pair<DocId, TextPosition> > full_result;
          virtual full_result FullContains(uchar const *) const = 0;
  
+
+        /**
+         *Debug
+         *
+         */
+        virtual TextPosition Lookup(TextPosition) const = 0;
      protected:
          // Protected constructor; call the static function InitTextCollection().
          TextCollection() { };
diff --git a/Tools.h b/Tools.h

index 45647e8..4def5e8 100644 (file)
--- a/Tools.h
+++ b/Tools.h
@@ -25,8 +25,13 @@
  #   define W 32
  #endif
  
+#ifndef WW
  #define WW (W*2)
+#endif
+
+#ifndef Wminusone
  #define Wminusone (W-1)
+#endif
  
  #ifndef uchar
  #define uchar unsigned char
diff --git a/makefile b/makefile

index 737484a..36094fe 100644 (file)
--- a/makefile
+++ b/makefile
@@ -1,7 +1,9 @@
  CC = g++
-CPPFLAGS = -Wall -ansi -pedantic -g -O2 -DNDEBUG
+LIBCDSPATH = ../libcds
+CPPFLAGS = -Wall -ansi -g -O2 -I$(LIBCDSPATH)/includes/ -DNDEBUG
+LIBCDSA = $(LIBCDSPATH)/lib/libcds.a
  
-testTextCollection_obs = testTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o 
+testTextCollection_obs = testTextCollection5.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o RLWaveletTree.o GapEncode.o BSGAP.o ${LIBCDSA}
  
  all: $(testTextCollection_obs)
author	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Sat, 7 Feb 2009 17:00:01 +0000 (17:00 +0000)
committer	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Sat, 7 Feb 2009 17:00:01 +0000 (17:00 +0000)
BSGAP.cpp	[new file with mode: 0644]	patch \| blob
BSGAP.h	[new file with mode: 0644]	patch \| blob
BitRank.cpp		patch \| blob \| history
BitRank.h		patch \| blob \| history
BlockArray.h		patch \| blob \| history
CSA.cpp		patch \| blob \| history
CSA.h		patch \| blob \| history
GapEncode.cpp	[new file with mode: 0644]	patch \| blob
GapEncode.h	[new file with mode: 0644]	patch \| blob
RLWaveletTree.cpp	[new file with mode: 0644]	patch \| blob
RLWaveletTree.h	[new file with mode: 0644]	patch \| blob
TextCollection.h		patch \| blob \| history
Tools.h		patch \| blob \| history
makefile		patch \| blob \| history