--- /dev/null
+/**
+ * Binary-searchable gap encoding scheme (BSGAP)
+ * Niko Välimäki
+ *
+ * Compile: g++ -Wall -o testBSGAP BSGAP.cpp Tools.cpp
+ *
+ * Based on:
+ *
+ * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures:
+ * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007).
+ */
+
+#include "BSGAP.h"
+#include <stdexcept>
+#include <cassert>
+using namespace std;
+
+
+const uchar BSGAP::bit_table[] = {
+ 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,
+ 1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,
+ 2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,
+ 1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,
+ 3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,
+ 1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,
+ 2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1};
+
+/**
+ * samplerate == number of keys in each gap encoded binary search tree
+ * default value is log^2 n where n is the number of keys.
+ */
+BSGAP::BSGAP(ulong *B, ulong u, bool freeB, ulong sampleRate)
+{
+ this->u = u;
+ this->n = 0;
+ this->P = 0;
+ this->offsetA = 0;
+ this->firstKeyA = 0;
+ this->topCount = 0;
+
+ // Count keys
+ for (ulong i = 0; i < u; i ++)
+ if (Tools::GetField(B, 1, i))
+ this->n ++;
+
+ if (n == 0) // Sanity check
+ {
+ if (freeB && B)
+ delete [] B;
+
+ return;
+ }
+
+ unsigned log2n = Tools::FloorLog2(this->n);
+ if (log2n < 2)
+ log2n = 2;
+ if (sampleRate == 0)
+ this->b = log2n * log2n;
+ else
+ this->b = sampleRate;
+
+ ulong firstKey = 0, lastKey;
+ // Find the first key
+ while (!Tools::GetField(B, 1, firstKey))
+ firstKey ++;
+
+ // Temp array of BSGAP structures
+ ulong **tempP = new ulong * [n / b + 1];
+ ulong *bitsInBSD = new ulong [n / b + 1];
+ ulong totalBits = 0;
+ this->firstKeyA = new BlockArray(n/b + 1, Tools::CeilLog2(this->u));
+
+ for (ulong i = 0; i < n/b; i ++)
+ {
+ // Find the last key
+ lastKey = firstKey;
+ for (ulong k = 0; k < b + 1 && lastKey < u; lastKey ++)
+ if (Tools::GetField(B, 1, lastKey))
+ k ++;
+ lastKey --;
+ // Sanity check
+ if (lastKey > u)
+ {
+ cout << "error: lastKey = " << lastKey << ", u = " << u << endl;
+ exit(0);
+ }
+
+ // lastKey of the last substructure is this->u
+ if (i == n/b - 1 && n%b == 0)
+ lastKey = u;
+
+ tempP[i] = GetSubtree(B, firstKey, firstKey, lastKey, b, true, bitsInBSD[i]); // FIXME: left of right subtree?
+ totalBits += bitsInBSD[i];
+ (*firstKeyA)[i] = firstKey;
+
+ firstKey = lastKey;
+ }
+
+ if (n - (n/b) * b != 0)
+ {
+ // Find the first key
+ while (!Tools::GetField(B, 1, firstKey))
+ firstKey ++;
+
+ tempP[n/b] = GetSubtree(B, firstKey, firstKey, u, n - (n/b) * b, true, bitsInBSD[n/b]); // FIXME: left of right subtree?
+ totalBits += bitsInBSD[n/b];
+ (*firstKeyA)[n/b] = firstKey;
+ }
+
+ // Number of nodes:
+ this->topCount = n%b ? n/b + 1: n/b;
+
+ // Catenate binary trees into one bitvector
+ this->P = new ulong[totalBits/W + 1];
+ this->bitsInP = totalBits;
+ ulong offset = 0;
+ for (ulong i = 0; i < this->topCount ; ++i)
+ {
+ BitCopy(P, tempP[i], offset, bitsInBSD[i]);
+ delete [] tempP[i];
+ offset += bitsInBSD[i];
+ }
+ delete [] tempP;
+
+ // Pointers to bit vector P (binary tree starting offsets)
+ // FIXME Use more succinct representation?
+ offset = 0;
+ this->offsetA = new BlockArray(this->topCount, Tools::CeilLog2(totalBits));
+ for (ulong i = 0; i < this->topCount ; ++i)
+ {
+ (*offsetA)[i] = offset;
+ offset += bitsInBSD[i];
+ }
+ delete [] bitsInBSD;
+
+ if (freeB)
+ delete [] B;
+}
+
+BSGAP::~BSGAP()
+{
+ delete [] P;
+ delete offsetA;
+ delete firstKeyA;
+}
+
+ulong BSGAP::rank(ulong i)
+{
+ if (n == 0) // Trivial case
+ return 0;
+ if (i >= u)
+ throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
+ if ((*firstKeyA)[0] > i)
+ return 0;
+ ulong l = 0, r = topCount - 1;
+ while (l < r) // Binary search on the array firstKeyA
+ {
+ ulong mid = l + (r - l)/2lu;
+ if ((*firstKeyA)[mid] < i)
+ l = mid + 1;
+ else
+ r = mid;
+ }
+ if ((*firstKeyA)[l] > i && l == 0)
+ return 0;
+ if ((*firstKeyA)[l] == i)
+ return 1 + l * b;
+ if ((*firstKeyA)[l] > i)
+ l --;
+
+ ulong lastKey, firstKey = (*firstKeyA)[l];
+ if (l == topCount - 1)
+ lastKey = u;
+ else
+ lastKey = (*firstKeyA)[l + 1];
+ ulong nSub = b;
+ if (l == topCount - 1 && n%b)
+ nSub = n%b;
+
+ ulong k = rank(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
+ // Add rank of substructures P[0], P[1], ..., P[j - 2]
+ return l * b + k;
+}
+
+ulong BSGAP::rank0(ulong i)
+{
+ return i - rank(i) + 1;
+}
+
+bool BSGAP::IsBitSet(ulong i)
+{
+ if (n == 0) // Trivial case
+ return false;
+ if (i >= u)
+ return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
+ if ((*firstKeyA)[0] > i)
+ return false;
+ ulong l = 0, r = topCount - 1;
+ while (l < r) // Binary search on the array firstKeyA
+ {
+ ulong mid = l + (r - l)/2lu;
+ if ((*firstKeyA)[mid] < i)
+ l = mid + 1;
+ else
+ r = mid;
+ }
+ if ((*firstKeyA)[l] > i && l == 0)
+ return false;
+ if ((*firstKeyA)[l] == i)
+ return true;
+ if ((*firstKeyA)[l] > i)
+ l --;
+
+ ulong lastKey, firstKey = (*firstKeyA)[l];
+ if (l == topCount - 1)
+ lastKey = u;
+ else
+ lastKey = (*firstKeyA)[l + 1];
+ ulong nSub = b;
+ if (l == topCount - 1 && n%b)
+ nSub = n%b;
+
+ return IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
+}
+
+/**
+ * Returns also rank_1(i) via the second parameter.
+ */
+bool BSGAP::IsBitSet(ulong i, ulong *resultRank)
+{
+ *resultRank = 0;
+ if (n == 0) // Trivial case
+ return false;
+ if (i >= u)
+ {
+ *resultRank = n;
+ return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
+ }
+ if ((*firstKeyA)[0] > i)
+ return false;
+ ulong l = 0, r = topCount - 1;
+ while (l < r) // Binary search on the array firstKeyA
+ {
+ ulong mid = l + (r - l)/2lu;
+ if ((*firstKeyA)[mid] < i)
+ l = mid + 1;
+ else
+ r = mid;
+ }
+ if ((*firstKeyA)[l] > i && l == 0)
+ return false;
+ if ((*firstKeyA)[l] == i)
+ {
+ *resultRank = 1 + l * b;
+ return true;
+ }
+ if ((*firstKeyA)[l] > i)
+ l --;
+
+ ulong lastKey, firstKey = (*firstKeyA)[l];
+ if (l == topCount - 1)
+ lastKey = u;
+ else
+ lastKey = (*firstKeyA)[l + 1];
+ ulong nSub = b;
+ if (l == topCount - 1 && n%b)
+ nSub = n%b;
+
+ bool result = IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true, resultRank);
+ *resultRank += l * b;
+ return result;
+}
+
+
+ulong BSGAP::select(ulong i)
+{
+ if (n == 0)
+ return 0;
+ if (i == 0)
+ return 0;
+ if (i > n)
+ throw std::out_of_range("BSGAP::select(): Parameter i was out of range");
+
+ ulong j = i/b;
+ if (i%b == 0)
+ j --;
+ ulong lastKey, firstKey = (*firstKeyA)[j];
+ if (j == topCount - 1)
+ lastKey = u;
+ else
+ lastKey = (*firstKeyA)[j + 1];
+ ulong nSub = b;
+ if (j == topCount - 1 && n%b)
+ nSub = n%b;
+
+ return select(P, (*offsetA)[j], firstKey, i - j * b, firstKey, lastKey, nSub, true);
+}
+
+ulong BSGAP::select0(ulong i)
+{
+ if (n == 0) // Trivial case
+ return i-1;
+ if (i > u - n)
+ throw std::out_of_range("BSGAP::select0(): Parameter i was out of range");
+
+ ulong l = 0, r = topCount - 1;
+ while (l < r) // Binary search on the array firstKeyA
+ {
+ ulong mid = l + (r - l)/2lu;
+ if ((*firstKeyA)[mid] - mid*b < i) // Number of zeros before [mid]
+ l = mid + 1;
+ else
+ r = mid;
+ }
+ if (l == 0 && (*firstKeyA)[l] >= i)
+ return i-1;
+ if (l > 0 && (*firstKeyA)[l] - l*b >= i)
+ --l; // FIXME: TEST
+
+ i -= (*firstKeyA)[l] - l*b;
+
+ ulong lastKey, firstKey = (*firstKeyA)[l];
+ if (l == topCount - 1)
+ lastKey = u;
+ else
+ lastKey = (*firstKeyA)[l + 1];
+ ulong nSub = b;
+ if (l == topCount - 1 && n%b)
+ nSub = n%b;
+
+ return select0(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
+}
+
+ulong BSGAP::select0(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
+{
+ while (1)
+ {
+ if (n == 0)
+ return l+i;
+
+ // Check if subtree is full --- FIXME Make this test an assert()!
+ if (IsSubtreeFull(firstKey, l, r, n))
+ throw runtime_error("BSGAP::select0(): subtree was full"); // Should not happen // (l == firstKey ? l + i - 1: l + i);
+
+ // Parse key value
+ ulong key;
+ ulong x = DeltaDecode(B, offset);
+ if (leftChild)
+ key = r - x;
+ else
+ key = l + x;
+
+ if (numberOfZeros(firstKey, l, key, n) < i)
+ {
+ offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+ i = i - numberOfZeros(firstKey, l, key, n);
+ l = key;
+ n = n - n/2 - 1;
+ leftChild = false;
+ }
+ else
+ {
+ offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+ r = key;
+ n = n/2;
+ leftChild = true;
+ }
+ }
+}
+
+
+ulong BSGAP::rank(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
+{
+ // Number of keys in subtrees on left-side
+ ulong result = 0;
+
+ while (1)
+ {
+
+ if (n == 0)
+ return result;
+
+ // Check if subtree is full
+ if (IsSubtreeFull(firstKey, l, r, n))
+ return result + (l == firstKey ? i - l + 1 : i - l);
+
+ // Parse key value
+ ulong key;
+ ulong x = DeltaDecode(B, offset);
+ if (leftChild)
+ key = r - x;
+ else
+ key = l + x;
+ if (key == i)
+ return result + n/2 + 1;
+
+ if (key < i)
+ {
+ offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+ result += n/2 + 1;
+ l = key;
+ n = n - n/2 - 1;
+ leftChild = false;
+ }
+ else
+ {
+ offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+ r = key;
+ n = n/2;
+ leftChild = true;
+ }
+ }
+}
+
+
+bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
+{
+ // Number of keys in subtrees on left-side
+ ulong result = 0;
+
+ while (1)
+ {
+
+ if (n == 0)
+ return false;
+
+ // Check if subtree is full
+ if (IsSubtreeFull(firstKey, l, r, n))
+ return result + (l == firstKey ? i - l + 1 : i - l);
+
+ // Parse key value
+ ulong key;
+ ulong x = DeltaDecode(B, offset);
+ if (leftChild)
+ key = r - x;
+ else
+ key = l + x;
+ if (key == i)
+ return true;
+
+ if (key < i)
+ {
+ offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+ result += n/2 + 1;
+ l = key;
+ n = n - n/2 - 1;
+ leftChild = false;
+ }
+ else
+ {
+ offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+ r = key;
+ n = n/2;
+ leftChild = true;
+ }
+ }
+}
+
+// Returns also the rank of i
+bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild, ulong *resultRank)
+{
+ // Number of keys in subtrees on left-side
+ ulong result = 0;
+
+ while (1)
+ {
+
+ if (n == 0)
+ {
+ *resultRank = result;
+ return false;
+ }
+
+ // Check if subtree is full
+ if (IsSubtreeFull(firstKey, l, r, n))
+ {
+ *resultRank = result + (l == firstKey ? i - l + 1 : i - l);
+ return true;
+ }
+
+ // Parse key value
+ ulong key;
+ ulong x = DeltaDecode(B, offset);
+ if (leftChild)
+ key = r - x;
+ else
+ key = l + x;
+ if (key == i)
+ {
+ *resultRank = result + n/2 + 1;
+ return true;
+ }
+
+ if (key < i)
+ {
+ offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+ result += n/2 + 1;
+ l = key;
+ n = n - n/2 - 1;
+ leftChild = false;
+ }
+ else
+ {
+ offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+ r = key;
+ n = n/2;
+ leftChild = true;
+ }
+ }
+}
+
+
+ulong BSGAP::select(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
+{
+ while (1)
+ {
+ if (n == 0)
+ return 0;
+
+ // Check if subtree is full
+ if (IsSubtreeFull(firstKey, l, r, n))
+ return (l == firstKey ? l + i - 1: l + i);
+
+ // Parse key value
+ ulong key;
+ ulong x = DeltaDecode(B, offset);
+ if (leftChild)
+ key = r - x;
+ else
+ key = l + x;
+ if (n/2 + 1 == i)
+ return key;
+ if (n/2 + 1 < i)
+ {
+ offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
+ i = i - n/2 - 1;
+ l = key;
+ n = n - n/2 - 1;
+ leftChild = false;
+ }
+ else
+ {
+ offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
+ r = key;
+ n = n/2;
+ leftChild = true;
+ }
+ }
+}
+
+ulong * BSGAP::GetSubtree(ulong *B, ulong firstKey, ulong l, ulong r, ulong n, bool leftChild, ulong &bits)
+{
+ bits = 0;
+ if (n == 0)
+ return 0;
+
+ // Check if subtree is full
+ if (IsSubtreeFull(firstKey, l, r, n))
+ return 0;
+
+ // Pick the median
+ ulong key = FindMedian(B, firstKey, l, r, n);
+
+ ulong *leftSub, *rightSub, leftBits, rightBits;
+ leftSub = GetSubtree(B, firstKey, l, key, n/2, true, leftBits);
+ rightSub = GetSubtree(B, firstKey, key, r, n - n/2 - 1, false, rightBits);
+
+ // Encode key value
+ ulong *keyDelta, keyBits;
+ if (leftChild)
+ keyDelta = DeltaEncode(r - key, keyBits, true);
+ else
+ keyDelta = DeltaEncode(key - l, keyBits, true);
+
+ // Encode jump offset if left and right subtrees exists
+ ulong *jumpOffset = 0, jumpBits = 0;
+ if (leftBits != 0 && rightBits != 0)
+ jumpOffset = DeltaEncode(leftBits, jumpBits);
+
+ // bits is the sum of keyBits, jumpBits, leftBits and rightBits
+ bits = keyBits + jumpBits + leftBits + rightBits;
+ ulong *output = new ulong[bits / W + 1];
+
+ /**
+ * Write "output"
+ */
+ BitCopy(output, keyDelta, 0, keyBits);
+ delete [] keyDelta;
+ ulong offset = keyBits;
+
+ if (jumpBits != 0)
+ {
+ BitCopy(output, jumpOffset, offset, jumpBits);
+ offset += jumpBits;
+ delete [] jumpOffset;
+ }
+
+ BitCopy(output, leftSub, offset, leftBits);
+ offset += leftBits;
+ BitCopy(output, rightSub, offset, rightBits);
+ offset += rightBits;
+
+ assert(offset == bits);
+ if (leftBits != 0)
+ delete [] leftSub;
+ if (rightBits != 0)
+ delete [] rightSub;
+ return output;
+}
+
+void BSGAP::BitCopy(ulong *dest, ulong *src, ulong offset, ulong len)
+{
+ if (len == 0)
+ return;
+ ulong i;
+
+ if (len <= W)
+ {
+ Tools::SetVariableField(dest, len, offset, *src);
+ return;
+ }
+
+ for (i = 0; i < len/W; i ++)
+ Tools::SetVariableField(dest, W, offset + i * W, src[i]);
+
+ if (i * W < len)
+ Tools::SetVariableField(dest, len - i*W, offset + i * W, src[i]);
+}
+
+ulong BSGAP::FindMedian(ulong *B, ulong firstKey, ulong l, ulong r, ulong n)
+{
+ // Linear scan: slow but affects only the construction time
+ n = n/2 + 1;
+ if (l != firstKey)
+ l ++; // Skip left boundary
+
+ while (n && l <= r)
+ {
+ if (Tools::GetField(B, 1, l))
+ n --;
+ l ++;
+ }
+ return l - 1;
+}
+
+ulong BSGAP::DeltaDecode(ulong *B, ulong &offset)
+{
+ // Dense coding
+ // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
+ const unsigned s = 180; // FIXME These should be class variables
+ const unsigned c = 256 - s; // FIXME Choose <s,c> values!
+ const unsigned b = 8; // one byte.
+
+ ulong i = 0;
+ ulong base = 0;
+ ulong tot = s;
+ while (Tools::GetVariableField(B, b, offset) < c)
+ {
+ i = i*c + Tools::GetVariableField(B, b, offset);
+ offset += b;
+ base += tot;
+ tot = tot * c;
+ }
+ i = i * s + (Tools::GetVariableField(B, b, offset) - c);
+ offset += b;
+ return i + base;
+}
+
+
+ulong * BSGAP::DeltaEncode(ulong value, ulong &bits, bool onlyPositive, bool negative)
+{
+ // Dense coding
+ // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
+ const unsigned s = 180; // FIXME These should be class variables
+ const unsigned c = 256 - s; // FIXME Choose <s,c> values!
+ const unsigned b = 8; // one byte
+
+ // Calculate size first:
+ bits = b;
+ ulong x = value / s;
+ while (x > 0)
+ {
+ --x;
+ bits += b;
+ x = x/c;
+ }
+
+ // bits is now the length of the whole codeword
+ ulong *B = new ulong[bits/W + 1];
+
+ // output codeword (backwards):
+ unsigned offset = bits - b;
+ ulong output = c + (value % s);
+ Tools::SetVariableField(B, b, offset, output);
+
+ x = value / s;
+ while (x > 0)
+ {
+ --x;
+ offset -= b;
+
+ output = x % c;
+ Tools::SetVariableField(B, b, offset, output);
+
+ x = x/c;
+ }
+
+ return B;
+}
+
+
+/**
+ * Saving the following data fields:
+ * ulong u, n; // Universe size, number of 1-bits in B
+ * ulong topCount; // Top structure and the number of substructures
+ * ulong bitsInP;
+ * ulong *P; // Pointer to BSGAP structures
+ * unsigned b; // Subdictionary size (\log^2 n)
+ * BlockArray *offsetA; // Array of pointers (into bitvector P)
+ * BlockArray *firstKeyA; // Array of first key positions of the substructures
+ */
+void BSGAP::Save(FILE *file) const
+{
+ if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Save(): file write error (u).");
+
+ if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Save(): file write error (n).");
+
+ if (std::fwrite(&(this->topCount), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Save(): file write error (topCount).");
+
+ if (std::fwrite(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Save(): file write error (bitsInP).");
+
+ for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
+ {
+ if (std::fwrite(this->P+offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Save(): file write error (P).");
+ }
+
+ if (std::fwrite(&(this->b), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Save(): file write error (b).");
+
+ offsetA->Save(file);
+ firstKeyA->Save(file);
+}
+
+/**
+ * Load from file
+ */
+BSGAP::BSGAP(FILE *file)
+{
+ if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Load(): file read error (u).");
+
+ if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Load(): file read error (n).");
+
+ if (std::fread(&(this->topCount), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Load(): file read error (topCount).");
+
+ if (std::fread(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Load(): file read error (bitsInP).");
+
+ P = new ulong[bitsInP/W+1];
+ for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
+ {
+ if (std::fread(this->P+offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Load(): file read error (P).");
+ }
+
+ if (std::fread(&(this->b), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("BSGAP::Load(): file read error (b).");
+
+ offsetA = new BlockArray(file);
+ firstKeyA = new BlockArray(file);
+}
+
--- /dev/null
+/**
+ * Binary-searchable gap encoding scheme (BSGAP)
+ * Niko Välimäki
+ *
+ * Based on:
+ *
+ * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures:
+ * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007).
+ */
+#ifndef _BSGAP_H_
+#define _BSGAP_H_
+
+#include "Tools.h"
+#include "BlockArray.h"
+
+class BSGAP
+{
+public:
+ BSGAP(ulong *, ulong, bool = false, ulong = 0);
+ BSGAP(FILE *);
+ ~BSGAP();
+ ulong rank(ulong);
+ ulong rank0(ulong);
+ ulong rank(bool b, ulong i) {return b?rank(i):i+1-rank(i);};
+ ulong select(ulong);
+ ulong select0(ulong);
+ ulong select(bool b, ulong i) {return b?select(i):select0(i);};
+
+ bool IsBitSet(ulong pos);
+ // Returns also the rank_1(pos) via the second param
+ bool IsBitSet(ulong pos, ulong *);
+
+ ulong size() {return u;};
+ ulong bitsSet() {return n;}
+ void Save(FILE *) const;
+private:
+ static const uchar bit_table[];
+ ulong u, n; // Universe size, number of 1-bits in B
+ ulong topCount; // Top structure and the number of substructures
+ ulong bitsInP; // Length of bit vector P
+ ulong *P; // Pointer to BSGAP structures
+ unsigned b; // Subdictionary size (\log^2 n)
+ BlockArray *offsetA; // Array of pointers (into bitvector P)
+ BlockArray *firstKeyA; // Array of first key positions of the substructures
+
+ ulong rank(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool);
+ bool IsBitSet(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool);
+ bool IsBitSet(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool, ulong *);
+ ulong select(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool);
+ ulong select0(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool);
+ ulong * GetSubtree(ulong *, ulong, ulong, ulong, ulong, bool, ulong &);
+ ulong DeltaDecode(ulong *, ulong &);
+ ulong DeltaDecode(ulong *, ulong &, bool &);
+ ulong * DeltaEncode(ulong, ulong &, bool = true, bool = false);
+ void BitCopy(ulong *, ulong *, ulong, ulong);
+ ulong FindMedian(ulong *, ulong, ulong, ulong, ulong);
+ unsigned FindLowestBit(ulong);
+
+ inline bool IsSubtreeFull(ulong firstKey, ulong l, ulong r, ulong n)
+ {
+ if (l == firstKey)
+ n --;
+
+ if (r - l - 1 == n)
+ return true;
+ return false;
+ }
+ /**
+ * Returns the number of zeros in the left subtree
+ */
+ inline ulong numberOfZeros(ulong firstKey, ulong l, ulong key, ulong n)
+ {
+ ulong z = key - l - 1 - n/2;
+ if (l == firstKey)
+ ++z;
+ if (z == -1lu)
+ z = 0;
+ return z;
+ }
+
+ inline ulong FindRightSubtree(ulong *B, ulong offset, ulong firstKey, ulong key, ulong l, ulong r, ulong n)
+ {
+ if (n - n/2 - 1 == 0)
+ return 0; // Does not exists
+
+ if (IsSubtreeFull(firstKey, key, r, n - n/2 - 1))
+ return 0; // Right subtree is full (does not exists)
+
+ if (n/2 == 0)
+ return offset; // Left subtree does not exists
+
+ if (IsSubtreeFull(firstKey, l, key, n/2))
+ return offset; // Left subtree is full (does not exists)
+
+ ulong len = DeltaDecode(B, offset);
+ return offset + len; // Skipping left subtree
+ }
+
+ inline ulong FindLeftSubtree(ulong *B, ulong offset, ulong firstKey, ulong key, ulong l, ulong r, ulong n)
+ {
+ if (n/2 == 0)
+ return 0; // Does not exists
+
+ if (IsSubtreeFull(firstKey, l, key, n/2))
+ return 0; // Left subtree is full
+
+ if (n - n/2 - 1 == 0)
+ return offset; // Right subtree does not exists
+
+ if (IsSubtreeFull(firstKey, key, r, n - n/2 - 1))
+ return offset; // Right subtree is full
+
+ DeltaDecode(B, offset); // Calculate new offset
+ return offset; // Left subtree is the first one
+ }
+
+};
+
+#endif
data=bitarray;
this->owner = owner;
this->n=n; // length of bitarray in bits
- b = W; // b is a word
- s=b*superFactor;
ulong aux=(n+1)%W;
if (aux != 0)
integers = (n+1)/W+1;
}
BitRank::~BitRank() {
- delete [] Rs;
+ delete Rs;
delete [] Rb;
if (owner) delete [] data;
}
void BitRank::BuildRank()
{
ulong num_sblock = n/s;
- ulong num_block = n/b;
- Rs = new ulong[num_sblock+1];//+1 we add the 0 pos
+ ulong num_block = n/W;
+ Rs = new BlockArray(num_sblock+1, Tools::CeilLog2(n));//+1 we add the 0 pos
Rb = new uchar[num_block+1];//+1 we add the 0 pos
ulong j;
- Rs[0] = 0lu;
+ (*Rs)[0] = 0lu;
for (j=1;j<=num_sblock;j++)
{
- Rs[j]=BuildRankSub((j-1)*superFactor,superFactor)+Rs[j-1];
+ (*Rs)[j]=BuildRankSub((j-1)*superFactor,superFactor)+(*Rs)[j-1];
}
Rb[0]=0;
//this rank ask from 0 to n-1
ulong BitRank::rank(ulong i) {
++i; // the following gives sum of 1s before i
- return Rs[i>>8]+Rb[i>>wordShift]
+ return (*Rs)[i>>8]+Rb[i>>wordShift]
+popcount(data[i >> wordShift] & ((1lu << (i & Wminusone))-1));
}
ulong l=0, r=n/s;
ulong mid=(l+r)/2;
- ulong rankmid = Rs[mid];
+ ulong rankmid = (*Rs)[mid];
while (l<=r) {
if (rankmid<x)
l = mid+1;
else
r = mid-1;
mid = (l+r)/2;
- rankmid = Rs[mid];
+ rankmid = (*Rs)[mid];
}
//sequential search using popcount over a int
ulong left;
ones = popcount(j);
}
//sequential search using popcount over a char
- left=left*b;
+ left=left*W;
rankmid = popcount8(j);
if (rankmid < x) {
j=j>>8;
ulong l=0, r=n/s;
ulong mid=(l+r)/2;
- ulong rankmid = mid * s - Rs[mid];
+ ulong rankmid = mid * s - (*Rs)[mid];
while (l<=r) {
if (rankmid<x)
l = mid+1;
else
r = mid-1;
mid = (l+r)/2;
- rankmid = mid * s - Rs[mid];
+ rankmid = mid * s - (*Rs)[mid];
}
//sequential search using popcount over a int
}
//sequential search using popcount over a char
- left=left*b;
+ left=left*W;
rankmid = 8 - popcount8(j);
if (rankmid < x) {
j=j>>8;
ulong BitRank::NumberOfBits() {
return n;
}
+
+/**
+ * Saving data fields:
+
+ ulong n;
+ ulong *data; //here is the bit-arra
+ */
+void BitRank::Save(FILE *file)
+{
+ if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BitRank::Save(): file write error (n).");
+
+ for (ulong offset = 0; offset < integers; ++offset)
+ if (std::fwrite(this->data + offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BitRank::Save(): file write error (data).");
+}
+
+BitRank::BitRank(FILE *file)
+{
+ owner = 1;
+ if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BitRank::Load(): file read error (n).");
+
+ ulong aux=(n+1)%W;
+ if (aux != 0)
+ integers = (n+1)/W+1;
+ else
+ integers = (n+1)/W;
+
+ data = new ulong[integers];
+ for (ulong offset = 0; offset < integers; ++offset)
+ if (std::fread(this->data + offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BitRank::Load(): file read error (data).");
+
+ BuildRank();
+}
#ifndef _BITSTREAM_H_
#define _BITSTREAM_H_
+#include "BlockArray.h"
#include "Tools.h"
class BitRank {
static const unsigned wordShift = 6;
static const unsigned superFactor = 4; // 256 bit blocks
#endif
+ static const unsigned s = W*superFactor;
ulong *data; //here is the bit-array
bool owner;
ulong n,integers;
- unsigned b,s;
- ulong *Rs; //superblock array
+ BlockArray *Rs; //superblock array
uchar *Rb; //block array
ulong BuildRankSub(ulong, ulong); //internal use of BuildRank
void BuildRank(); //crea indice para rank
public:
BitRank(ulong *, ulong, bool);
+ BitRank(FILE *);
~BitRank(); //destructor
+ void Save(FILE *);
ulong rank(ulong i); //Rank from 0 to n-1
ulong select(ulong x); // gives the position of the x:th 1.
ulong select0(ulong x); // gives the position of the x:th 0.
#ifndef _BLOCK_ARRAY_H_
#define _BLOCK_ARRAY_H_
-#include <iostream>
#include "Tools.h"
+#include <iostream>
+#include <stdexcept>
class BlockArray
{
ulong spaceInBits() {
return n*blockLength+W; // plus 4 ulong's
}
+
+ /**
+ * Saving data fields:
+ * ulong n;
+ * ulong blockLength;
+ * ulong* data;
+ */
+ void Save(FILE *file) const
+ {
+ if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BlockArray::Save(): file write error (n).");
+ if (std::fwrite(&(this->blockLength), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BlockArray::Save(): file write error (blockLength).");
+ for (ulong offset = 0; offset < n*blockLength/W+1; offset ++)
+ {
+ if (std::fwrite(this->data + offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BlockArray::Save(): file write error (data).");
+ }
+ }
+
+ /**
+ * Load from file
+ */
+ BlockArray(FILE *file)
+ {
+ if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BlockArray::Load(): file read error (n).");
+ if (std::fread(&(this->blockLength), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BlockArray::Load(): file read error (blockLength).");
+
+ data = new ulong[n*blockLength/W+1];
+ for (ulong offset = 0; offset < n*blockLength/W+1; offset ++)
+ {
+ if (std::fread(this->data + offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("BlockArray::Load(): file read error (data).");
+ }
+ }
};
#endif
using SXSI::TextCollection;
+// Save file version info
+const uchar CSA::versionFlag = 2;
////////////////////////////////////////////////////////////////////////////
// Class CSA::THuffAlphabetRank
delete bitrank;
}
+/**
+ * Saving data fields:
+ BitRank *bitrank;
+ bool leaf;
+ uchar ch;
+ left child;
+ right child;
+*/
+void CSA::THuffAlphabetRank::Save(FILE *file)
+{
+
+}
+
+CSA::THuffAlphabetRank::THuffAlphabetRank(FILE *file)
+{
+
+
+}
/////////////////////////////////////////////f///////////////////////////////
// Class CSA
* Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
*/
CSA::CSA(unsigned samplerate)
- : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
- codetable(0), dynFMI(0), numberOfTexts(0), maxTextLength(0), endmarkerDocId(0),
- textLength(0), textStartPos(0)
+ : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0),
+ codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0),
+ textLength(0), textStartPos(0), emptyTextRank(0)
{
this->samplerate = samplerate;
if (m > maxTextLength)
maxTextLength = m; // Store length of the longest text seen so far.
- this->n += m;
- this->numberOfTexts ++;
- dynFMI->addText(text, m);
+ if (m > 1)
+ {
+ this->n += m;
+ this->numberOfTexts ++;
+ this->numberOfAllTexts ++;
+ dynFMI->addText(text, m);
+ }
+ else
+ {
+ emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here
+ this->numberOfAllTexts ++; // Replace with dynamic bitvector
+ }
}
void CSA::MakeStatic()
{
// Sanity check:
if (dynFMI == 0)
+ throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0).");
+
+ // Bitvector of empty texts
{
- std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
- std::exit(0);
+ //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <<std::endl;
+ ulong *tempB = new ulong[numberOfAllTexts/W + 1];
+ for (ulong i = 0; i < numberOfAllTexts/W + 1; ++i)
+ tempB[i] = 0;
+ for (std::set<unsigned>::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it)
+ Tools::SetField(tempB, 1, (*it), 1);
+ emptyTextId.clear();
+ emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true);
}
-
+
uchar *bwt = dynFMI->getBWT();
/*printf("123456789012345678901234567890123456789\n");
for (TextPosition i = 0; i < n; i ++)
delete dynFMI;
dynFMI = 0;
- makewavelet(bwt);
+ makewavelet(bwt); // Deletes bwt!
+ bwt = 0;
// Calculate BWT end-marker position (of last inserted text)
// and the length of the first text (counter l):
ulong i = 0;
ulong l = 1;
- while (bwt[i] != '\0')
+
+ uchar c = alphabetrank->access(i);
+ while (c != '\0')
{
- uchar c = bwt[i];
i = C[c]+alphabetrank->rank(c, i)-1;
++l;
+ c = alphabetrank->access(i);
}
bwtEndPos = i;
+ assert(bwtEndPos < n);
//printf("bwtEndPos = %lu\n", bwtEndPos);
- delete [] bwt;
-
// Build up arrays for text length and starting positions
+ // FIXME Remove, temp data
textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength));
textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n));
(*textLength)[0] = l;
bool CSA::EmptyText(DocId k) const
{
- assert(k < (DocId)numberOfTexts);
- return (1 == (*textLength)[k]);
+ assert(k < (DocId)numberOfTexts);
+ if (emptyTextRank->IsBitSet(k))
+ return true;
+ return false;
}
uchar* CSA::GetText(DocId k) const
{
assert(k < (DocId)numberOfTexts);
- uchar* result = new uchar[(*textLength)[k]];
+ ulong textRank = 0;
+ if (emptyTextRank->IsBitSet(k, &textRank))
+ {
+ uchar* result = new uchar[1];
+ result[0] = 0;
+ return result;
+ }
+ // Map to non-empty text
+ k -= textRank; //emptyTextRank->rank(k);
+
TextPosition i = k;
- TextPosition j = (*textLength)[k] - 1;
- result[j] = '\0';
+ // FIXME
+ string result; //TextPosition j = (*textLength)[k] - 1;
- uchar c = alphabetrank->charAtPos(i);
+ uchar c = alphabetrank->access(i);
while (c != '\0')
{
- --j;
- result[j] = c;
+ result.push_back(c);
i = C[c]+alphabetrank->rank(c,i)-1;
- c = alphabetrank->charAtPos(i); // "next" char.
+ c = alphabetrank->access(i); // "next" char.
}
- assert(j == 0);
- return result;
+
+ // Convert to uchar (FIXME return string?)
+ i = result.size();
+ uchar* res = new uchar[i+1];
+ res[i] = '\0';
+ for (ulong j = 0; j < i; ++j)
+ res[i-j-1] = result[j];
+ return res;
}
-
+/*
+ * Not supported
uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
{
assert(k < (DocId)numberOfTexts);
assert(j < (*textLength)[k]);
assert(i <= j);
+ ulong textRank = 0;
+ if (emptyTextRank->IsBitSet(k, &textRank))
+ {
+ uchar* result = new uchar[1];
+ result[0] = 0;
+ return result;
+ }
+
+ // Map to non-empty text
+ k -= textRank; // emptyTextRank->rank(k);
+
// Start position of k'th text
ulong start = (*textStartPos)[k];
return Substring(i + start, j-i+1);
-}
+ }*/
/******************************************************************
* Existential queries
bool CSA::IsSuffix(uchar const *pattern) const
{
- // Here counting is as fast as isSuffix():
+ // Here counting is as fast as IsSuffix():
if (CountSuffix(pattern) > 0)
return true;
return false;
bool CSA::IsEqual(uchar const *pattern) const
{
TextPosition m = std::strlen((char *)pattern);
- //if (m == 0)
- // return false; // FIXME Check for empty texts!
+ if (m == 0)
+ {
+ if (numberOfAllTexts - numberOfTexts > 0u)
+ return true; // Empty texts exists
+ return false; // No empty texts exists
+ }
TextPosition sp = 0, ep = 0;
// Match including end-marker
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return numberOfTexts;
+ return numberOfAllTexts;
TextPosition sp = 0, ep = 0;
Search(pattern, m, &sp, &ep);
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return numberOfTexts;
+ return numberOfAllTexts;
TextPosition sp = 0, ep = 0;
// Search with end-marker
unsigned CSA::CountEqual(uchar const *pattern) const
{
TextPosition m = strlen((char const *)pattern);
-// if (m == 0)
-// return ; // FIXME Test for empty texts.
+ if (m == 0)
+ return numberOfAllTexts - numberOfTexts; // Empty texts.
TextPosition sp = 0, ep = 0;
// Match including end-marker
unsigned CSA::CountContains(uchar const * pattern) const
{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return numberOfAllTexts; // Total number of texts.
+
// Here counting is as slow as fetching the result set
// because we would have to filter out occ's that fall within same document.
TextCollection::document_result result = Contains(pattern);
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return TextCollection::document_result();
+ return TextCollection::document_result(); // FIXME Should return all 1...k
TextPosition sp = 0, ep = 0;
Search(pattern, m, &sp, &ep);
if (resultSize == 0)
return result;
+ result.reserve(resultSize); // Try to avoid reallocation.
+
+ // Iterate through end-markers in [sp,ep]:
unsigned i = 0;
if (sp > 0)
i = alphabetrank->rank(0, sp - 1);
{
// End-marker we found belongs to the "preceeding" doc in the collection
DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
result.push_back(docId);
+
-- resultSize;
++ i;
}
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return TextCollection::document_result();
+ return TextCollection::document_result(); // FIXME Should return all 1...k
TextPosition sp = 0, ep = 0;
// Search with end-marker
Search(pattern, m+1, &sp, &ep);
TextCollection::document_result result;
+ result.reserve(ep-sp+1); // Try to avoid reallocation.
+ ulong sampled_rank_i = 0;
// Check each occurrence
for (; sp <= ep; ++sp)
{
TextPosition i = sp;
- uchar c = alphabetrank->charAtPos(i);
- while (c != '\0' && !sampled->IsBitSet(i))
+
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
{
- i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
- c = alphabetrank->charAtPos(i);
+ i = C[c]+alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i);
}
+ // Assert: c == '\0' OR sampled->IsBitSet(i)
+
if (c == '\0')
{
// Rank among the end-markers in BWT
unsigned endmarkerRank = alphabetrank->rank(0, i) - 1;
// End-marker that we found belongs to the "preceeding" doc in collection:
- DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
result.push_back(docId);
}
else // Sampled position
{
- TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
- result.push_back(DocIdAtTextPos(textPos));
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(docId);
}
}
{
TextPosition m = strlen((char const *)pattern);
if (m == 0)
- return TextCollection::document_result();
+ return TextCollection::document_result(); // FIXME Should return all empty texts
TextPosition sp = 0, ep = 0;
// Match including end-marker
if (resultSize == 0)
return result;
+ result.reserve(resultSize); // Try to avoid reallocation.
+
unsigned i = 0;
if (sp > 0)
i = alphabetrank->rank(0, sp - 1);
while (resultSize)
{
- // End-marker we found belongs to the "previous" doc in the collection
+ // End-marker we found belongs to the "previous" doc in the collection
DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
result.push_back(docId);
+
-- resultSize;
++ i;
}
TextPosition sp = 0, ep = 0;
// Search all occurrences
Search(pattern, m, &sp, &ep);
-
+
// We want unique document indentifiers, using std::set to collect them
std::set<DocId> resultSet;
+ ulong sampled_rank_i = 0;
// Check each occurrence
for (; sp <= ep; ++sp)
{
TextPosition i = sp;
- uchar c = alphabetrank->charAtPos(i);
- while (c != '\0' && !sampled->IsBitSet(i))
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
{
i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
- c = alphabetrank->charAtPos(i);
+ c = alphabetrank->access(i);
}
if (c == '\0')
{
}
else
{
- TextPosition textPos = (*suffixes)[sampled->rank(i)-1];
- resultSet.insert(DocIdAtTextPos(textPos));
+ DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+ assert((unsigned)di < numberOfTexts);
+ resultSet.insert(di);
}
}
- // Convert std::set to std::vector (TODO better way to construct result vector?)
- TextCollection::document_result result;
- for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
- result.push_back(*it);
+ // Convert std::set to std::vector
+ TextCollection::document_result result(resultSet.begin(), resultSet.end());
+ // Map to doc ID's
+ for (document_result::iterator it = result.begin(); it != result.end(); ++it)
+ *it = emptyTextRank->select0(*it+1);
return result;
}
{
TextPosition m = strlen((char *)pattern);
if (m == 0)
- return full_result();
+ return full_result(); // FIXME Throw exception?
TextPosition sp = 0, ep = 0;
// Search all occurrences
Search(pattern, m, &sp, &ep);
full_result result;
+ result.reserve(ep-sp+1); // Try to avoid reallocation.
+ ulong sampled_rank_i = 0;
// Report each occurrence
for (; sp <= ep; ++sp)
{
TextPosition i = sp;
TextPosition dist = 0;
- uchar c = alphabetrank->charAtPos(i);
- while (c != '\0' && !sampled->IsBitSet(i))
+ uchar c = alphabetrank->access(i);
+ while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i))
{
- i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
- c = alphabetrank->charAtPos(i);
+ i = C[c]+alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i);
++ dist;
}
if (c == '\0')
// End-marker that we found belongs to the "preceeding" doc in collection:
DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts;
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
result.push_back(make_pair(docId, dist));
}
else
{
- TextPosition textPos = (*suffixes)[sampled->rank(i)-1] + dist;
- // Read document identifier and its starting position in text order
- DocId docId = DocIdAtTextPos(textPos);
- result.push_back(make_pair(docId, textPos - (*textStartPos)[docId]));
+ TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1];
+// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text
+
+ // Map to doc ID:
+ docId = emptyTextRank->select0(docId+1);
+ result.push_back(make_pair(docId, textPos));
}
}
}
-/**
- * Finds document identifier for given text position
- *
- * Starting text position of the document is stored into second parameter.
- * Binary searching on text starting positions.
- */
-TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
-{
- assert(i < n);
-
- DocId a = 0;
- DocId b = numberOfTexts - 1;
- while (a < b)
- {
- DocId c = a + (b - a)/2;
- if ((*textStartPos)[c] > i)
- b = c - 1;
- else if ((*textStartPos)[c+1] > i)
- return c;
- else
- a = c + 1;
- }
-
- assert(a < (DocId)numberOfTexts);
- assert(i > (*textStartPos)[a]);
- assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
- return a;
-}
-
-/**
- * Count end-markers in given interval
- */
-unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep) const
-{
- if (sp > ep)
- return 0;
-
- ulong ranksp = 0;
- if (sp != 0)
- ranksp = alphabetrank->rank(0, sp - 1);
-
- return alphabetrank->rank(0, ep) - ranksp;
-}
-
/**
* Save index to a file handle
*
* Throws a std::runtime_error exception on i/o error.
* First byte that is saved represents the version number of the save file.
- * In version 1 files, the data fields are:
- * <uchar version> version info;
- * <unsigned s> samplerate;
- * <ulong n> length of the BWT;
- * <ulong bwt> end marker position in BWT;
- * <uchar *> BWT string of length n;
- * <unsigned r> number of texts;
- * <ulong max> length of longest text;
- * <vector textLength>
- * array of <ulong, ulong> pairs.
- *
- * TODO: Save the data structures instead of BWT sequence?
- * TODO: Don't save textLength and textStartPos arrays.
+ * In version 2 files, the data fields are:
+ * uchar versionFlag;
+ TextPosition n;
+ unsigned samplerate;
+ unsigned C[256];
+ TextPosition bwtEndPos;
+ static_sequence * alphabetrank;
+ BSGAP *sampled;
+ BlockArray *suffixes;
+ BlockArray *suffixDocId;
+ unsigned numberOfTexts;
+ unsigned numberOfAllTexts;
+ ulong maxTextLength;
+ BlockArray *endmarkerDocId;
+ BSGAP *emptyTextRank;
*/
void CSA::Save(FILE *file) const
{
- // Saving version 1 data:
- uchar versionFlag = 1;
+ // Saving version info:
if (std::fwrite(&versionFlag, 1, 1, file) != 1)
throw std::runtime_error("CSA::Save(): file write error (version flag).");
+ if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("CSA::Save(): file write error (n).");
if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1)
throw std::runtime_error("CSA::Save(): file write error (samplerate).");
- if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (n).");
+ for(ulong i = 0; i < 256; ++i)
+ if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("CSA::Save(): file write error (C table).");
if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
throw std::runtime_error("CSA::Save(): file write error (bwt end position).");
-
- for (ulong offset = 0; offset < n; offset ++)
- {
- uchar c = alphabetrank->charAtPos(offset);
- if (std::fwrite(&c, sizeof(uchar), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (bwt sequence).");
- }
- unsigned r = numberOfTexts;
- if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (r).");
+ alphabetrank->save(file);
+ sampled->Save(file);
+ suffixes->Save(file);
+ suffixDocId->Save(file);
+ if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("CSA::Save(): file write error (numberOfTexts).");
+ if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts).");
if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
throw std::runtime_error("CSA::Save(): file write error (maxTextLength).");
-
- for (r = 0; r < numberOfTexts; ++ r)
- {
- if (std::fwrite(&((*textLength)[r]), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (text length).");
- if (std::fwrite(&((*textStartPos)[r]), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Save(): file write error (text start).");
- }
+
+ endmarkerDocId->Save(file);
+ emptyTextRank->Save(file);
}
delete dynFMI; dynFMI = 0;
delete alphabetrank; alphabetrank = 0;
delete sampled; sampled = 0;
- delete [] suffixes; suffixes = 0;
- delete [] positions; positions = 0;
+ delete suffixes; suffixes = 0;
+ delete suffixDocId; suffixDocId = 0;
+ delete positions; positions = 0;
delete [] codetable; codetable = 0;
delete endmarkerDocId; endmarkerDocId = 0;
+ delete emptyTextRank; emptyTextRank = 0;
+ // FIXME Remove following:
delete textLength; textLength = 0;
delete textStartPos; textStartPos = 0;
this->maxTextLength = 0;
this->numberOfTexts = 0;
+ this->numberOfAllTexts = 0;
this->samplerate = samplerate;
this->n = 0;
- uchar versionFlag = 0;
- if (std::fread(&versionFlag, 1, 1, file) != 1)
+ uchar verFlag = 0;
+ if (std::fread(&verFlag, 1, 1, file) != 1)
throw std::runtime_error("CSA::Load(): file read error (version flag).");
- if (versionFlag != 1)
- throw std::runtime_error("CSA::Load(): invalid start byte.");
+ if (verFlag != CSA::versionFlag)
+ throw std::runtime_error("CSA::Load(): invalid save file version.");
+ if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
+ throw std::runtime_error("CSA::Load(): file read error (n).");
if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1)
throw std::runtime_error("CSA::Load(): file read error (samplerate).");
- if (this->samplerate == 0)
- this->samplerate = samplerate;
+// FIXME samplerate can not be changed during load.
+// if (this->samplerate == 0)
+// this->samplerate = samplerate;
- if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (n).");
+ for(ulong i = 0; i < 256; ++i)
+ if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("CSA::Load(): file read error (C table).");
if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1)
throw std::runtime_error("CSA::Load(): file read error (bwt end position).");
- uchar *bwt = new uchar[n];
- for (ulong offset = 0; offset < n; offset ++)
- if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (bwt sequence).");
-
- unsigned r = 0;
- if (std::fread(&r, sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (r).");
+ alphabetrank = static_sequence::load(file);
+ sampled = new BSGAP(file);
+ suffixes = new BlockArray(file);
+ suffixDocId = new BlockArray(file);
+ if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("CSA::Load(): file read error (numberOfTexts).");
+ if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1)
+ throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts).");
if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1)
throw std::runtime_error("CSA::Load(): file read error (maxTextLength).");
-
- // Build up arrays for text length and starting positions
- textLength = new BlockArray(r, Tools::CeilLog2(maxTextLength));
- textStartPos = new BlockArray(r, Tools::CeilLog2(this->n));
- while (r > 0)
- {
- TextPosition length = 0, start = 0;
- if (std::fread(&length, sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (text length).");
- if (std::fread(&start, sizeof(TextPosition), 1, file) != 1)
- throw std::runtime_error("CSA::Load(): file read error (text start).");
-
- (*textLength)[numberOfTexts] = length;
- (*textStartPos)[numberOfTexts] = start;
- ++numberOfTexts;
- --r;
- }
+ endmarkerDocId = new BlockArray(file);
+ emptyTextRank = new BSGAP(file);
- // Construct data structures
- makewavelet(bwt);
- delete [] bwt;
- maketables(); // FIXME: this will redo text length tables
+ // FIXME Construct data structures with new samplerate
+ //maketables(); // FIXME: this will redo text length tables
}
* Rest of the functions follow...
*/
+/*
+ * Not supported
uchar * CSA::Substring(TextPosition i, TextPosition l) const
{
uchar *result = new uchar[l + 1];
for (dist = 0; dist < skip + l; dist++)
{
- int c = alphabetrank->charAtPos(j);
+ int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp);
if (c == '\0')
{
// Rank among the end-markers in BWT
result[l] = 0u;
return result;
}
-
+*/
/*TextPosition CSA::inverse(TextPosition i)
{
TextPosition skip = samplerate - i % samplerate;
return j;
}*/
-ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
+ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const
+{
// use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
int c = (int)pattern[m-1];
TextPosition i=m-1;
return ep - sp + 1;
else
return 0;
- }
+}
/*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
{
return 0;
}*/
-CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
+CSA::TextPosition CSA::Lookup(TextPosition i) const
{
TextPosition dist=0;
while (!sampled->IsBitSet(i))
{
- int c = alphabetrank->charAtPos(i);
+
+ int c = alphabetrank->access(i);
if (c == '\0')
{
// End-markers are sampled
DocId docId = (*endmarkerDocId)[endmarkerRank];
return (*textStartPos)[docId] + dist;
}
- i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ i = C[c]+alphabetrank->rank(c,i)-1;
++dist;
}
delete alphabetrank;
delete sampled;
delete suffixes;
+ delete suffixDocId;
delete positions;
delete [] codetable;
delete endmarkerDocId;
delete textLength;
delete textStartPos;
+ delete emptyTextRank;
}
void CSA::makewavelet(uchar *bwt)
C[i]=C[i-1]+prev;
prev = temp;
}
- this->codetable = node::makecodetable(bwt,n);
- alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
- //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
+// this->codetable = node::makecodetable(bwt,n);
+// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
+// delete [] bwt;
+ //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt!
+
+ uint *text = new uint[n];
+ for (i = 0; i < n; ++i) // Silly
+ text[i] = bwt[i];
+ delete [] bwt;
+ bwt = 0;
+
+ alphabet_mapper * am = new alphabet_mapper_none();
+ static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(16); // FIXME samplerate?
+
+ //cout << "Building Huffman table..."; cout.flush();
+
+ wt_coder * wtc = new wt_coder_huff(text,n,am);
+
+ //cout << "done" << endl; cout.flush();
+ //cout << "Building static_sequence..."; cout.flush();
+
+ alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am);
+ delete [] text;
+ text = 0;
/* for (i = 0; i < n; ++i)
{
ulong *sampledpositions = new ulong[n/W+1];
unsigned ceilLog2n = Tools::CeilLog2(n);
- suffixes = new BlockArray(sampleLength, ceilLog2n);
positions = new BlockArray(sampleLength, ceilLog2n);
-
+ BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n);
+
// Mapping from end-markers to doc ID's:
endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts));
sampledpositions[i]=0lu;
ulong x,p=bwtEndPos;
+ ulong sampleCount = 0;
// Keeping track of text position of end-markers seen
ulong posOfSuccEndmarker = n;
DocId textId = numberOfTexts;
// i substitutes SA->GetPos(i)
x=(i==n-1)?0:i+1;
- if (x % samplerate == 0) {
+ if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) {
Tools::SetField(sampledpositions,1,p,1);
- (*positions)[x/samplerate] = p;
+ (*positions)[sampleCount] = p;
+ (*tmpSuffix)[sampleCount] = x; // FIXME remove
+ sampleCount ++;
}
- uchar c = alphabetrank->charAtPos(p);
+ uchar c = alphabetrank->access(p);
if (c == '\0')
{
--textId;
assert(dynTextLength[i].second == (*textStartPos)[i]);
}
*/
- sampled = new BitRank(sampledpositions,n,true);
-
+ sampled = new BSGAP(sampledpositions,n,true);
+ sampleLength = sampled->rank(n-1);
+ assert(sampleCount == sampleLength);
+// std::cout << ";sampleLength;" << sampleLength << std::endl;
+ // Suffixes == offset from text start position
+ suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength));
+ suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts));
+
//suffixes:
for(i=0; i<sampleLength; i++) {
+ assert((*positions)[i] < n);
j = sampled->rank((*positions)[i]);
if (j==0) j=sampleLength;
- (*suffixes)[j-1] = (i*samplerate==n)?0:i*samplerate;
- }
+ TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate;
+ (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]);
+
+ assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts);
+ assert((*suffixDocId)[j-1] < numberOfTexts);
+ // calculate offset from text start:
+ (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]];
+ }
+ // FIXME Temp, remove
+ delete tmpSuffix;
+ tmpSuffix=0;
+ delete positions;
+ positions =0;
+ delete textLength;
+ textLength = 0;
+ delete textStartPos;
+ textStartPos = 0;
+}
+
+
+/**
+ * Finds document identifier for given text position
+ *
+ * Starting text position of the document is stored into second parameter.
+ * Binary searching on text starting positions.
+ */
+TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
+{
+ assert(i < n);
+
+ DocId a = 0;
+ DocId b = numberOfTexts - 1;
+ while (a < b)
+ {
+ DocId c = a + (b - a)/2;
+ if ((*textStartPos)[c] > i)
+ b = c - 1;
+ else if ((*textStartPos)[c+1] > i)
+ return c;
+ else
+ a = c + 1;
+ }
+
+ assert(a < (DocId)numberOfTexts);
+ assert(i >= (*textStartPos)[a]);
+ assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1]));
+ return a;
}
CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
#ifndef _CSA_H_
#define _CSA_H_
+#include "dynFMI.h"
+
+// Include from XMLTree/libcds
+#include <basics.h>
+#include <static_bitsequence.h>
+#include <alphabet_mapper.h>
+#include <static_sequence.h>
-#include <map>
+#include <set>
#include <vector>
#include "BitRank.h"
-#include "dynFMI.h"
#include "TextCollection.h"
#include "BlockArray.h"
+#include "RLWaveletTree.h"
/**
* Implementation of the TextCollection interface
void MakeStatic();
bool EmptyText(DocId) const;
uchar* GetText(DocId) const;
- uchar* GetText(DocId, TextPosition, TextPosition) const;
+// uchar* GetText(DocId, TextPosition, TextPosition) const;
bool IsPrefix(uchar const *) const;
bool IsSuffix(uchar const *) const;
void Load(FILE *, unsigned);
void Save(FILE *) const;
- // debug
- TextPosition Lookup(TextPosition) const;
+
+ // Debug FIXME Remove
+ void deleteWT()
+ {
+ delete alphabetrank;
+ alphabetrank = 0;
+ delete [] codetable;
+ codetable = 0;
+ }
+ void deleteSamples()
+ {
+ delete sampled;
+ sampled =0;
+ delete suffixes;
+ suffixes = 0;
+ delete positions;
+ positions = 0;
+ delete suffixDocId;
+ suffixDocId = 0;
+ }
+ void deleteEndmarker()
+ {
+ delete endmarkerDocId;
+ endmarkerDocId = 0;
+ }
private:
class TCodeEntry {
bool leaf;
public:
THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
+ THuffAlphabetRank(FILE *);
~THuffAlphabetRank();
+
+ void Save(FILE *);
bool Test(uchar *, TextPosition);
inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
}
return true;
}
- inline int charAtPos(TextPosition i) const {
+ inline uchar access(TextPosition i) const {
THuffAlphabetRank const * temp=this;
while (!temp->leaf) {
if (temp->bitrank->IsBitSet(i)) {
temp = temp->left;
}
}
- return (int)temp->ch;
+ return temp->ch;
+ }
+
+ inline uchar charAtPos(ulong i, TextPosition *rank) const {
+ THuffAlphabetRank const * temp=this;
+ while (!temp->leaf) {
+ if (temp->bitrank->IsBitSet(i)) {
+ i = temp->bitrank->rank(i)-1;
+ temp = temp->right;
+ } else {
+ i = i-temp->bitrank->rank(i);
+ temp = temp->left;
+ }
+ }
+ (*rank)=i;
+ return temp->ch;
}
};
static const unsigned char print = 1;
static const unsigned char report = 1;
+ static const uchar versionFlag;
TextPosition n;
unsigned samplerate;
unsigned C[256];
TextPosition bwtEndPos;
- THuffAlphabetRank *alphabetrank;
- BitRank *sampled;
+// THuffAlphabetRank *alphabetrank;
+ // RLWaveletTree *alphabetrank;
+ static_sequence * alphabetrank;
+ BSGAP *sampled;
BlockArray *suffixes;
+ BlockArray *suffixDocId;
BlockArray *positions;
TCodeEntry *codetable;
DynFMI *dynFMI;
// Total number of texts in the collection
unsigned numberOfTexts;
+ // Total number of texts including empty texts
+ unsigned numberOfAllTexts;
// Length of the longest text
ulong maxTextLength;
-
-
+
// Array of document id's in the order of end-markers in BWT
// Access by endmarkerDocId[rank_$(L, p) - 1].
BlockArray *endmarkerDocId;
BlockArray *textLength;
// Array of text starting positions (in the inserted order)
BlockArray *textStartPos;
-
+
+ // FIXME Replace with a more succinct data structure
+ std::set<unsigned> emptyTextId;
+ BSGAP *emptyTextRank;
+
// Private methods
uchar * BWT(uchar *);
uchar * LoadFromFile(const char *);
// Following are not part of the public API
DocId DocIdAtTextPos(TextPosition) const;
- unsigned CountEndmarkers(TextPosition, TextPosition) const;
ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
TextPosition Inverse(TextPosition) const;
TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
TextPosition Psi(TextPosition) const;
uchar * Substring(TextPosition, TextPosition) const;
+ TextPosition Lookup(TextPosition) const;
+
+ /**
+ * Count end-markers in given interval
+ */
+ inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
+ {
+ if (sp > ep)
+ return 0;
+
+ ulong ranksp = 0;
+ if (sp != 0)
+ ranksp = alphabetrank->rank(0, sp - 1);
+
+ return alphabetrank->rank(0, ep) - ranksp;
+ }
};
#endif
--- /dev/null
+/**
+ * Gap encoding scheme
+ * Niko Välimäki
+ *
+ */
+
+#include "GapEncode.h"
+#include <stdexcept>
+using namespace std;
+
+
+GapEncode::GapEncode(ulong *B, ulong u, bool freeB = false)
+{
+ ulong i, lastIndex, *bitRun, *runLength;
+ this->u = u;
+ this->numberOfRuns = 0;
+ bitRun = new ulong[u/W + 1];
+ for (i = 0; i < u/W + 1; i ++)
+ bitRun[i] = 0;
+
+ // Collect the runs of 1- or 0-bits
+ totalLength = 0;
+ uchar curBit = 0xFF;
+ lastIndex = 0;
+ for (i = 0; i < u; i ++)
+ if (Tools::GetField(B, 1, i) != curBit)
+ {
+ if (curBit == 1)
+ totalLength += i - lastIndex;
+ else if (Tools::GetField(B, 1, i))
+ Tools::SetField(bitRun, 1, i, 1);
+
+ lastIndex = i;
+ curBit = Tools::GetField(B, 1, i);
+ }
+ // Handle the last run
+ if (curBit == 1)
+ totalLength += i - lastIndex;
+
+ runLength = new ulong[totalLength/W + 1];
+ for (i = 0; i < totalLength/W + 1; i ++)
+ runLength[i] = 0;
+
+ // Calculate run lengths
+ totalLength = 0;
+ curBit = 0xFF;
+ lastIndex = 0;
+ for (i = 0; i < u; i ++)
+ if (Tools::GetField(B, 1, i) != curBit)
+ {
+ if (curBit == 1)
+ {
+ totalLength += i - lastIndex;
+ Tools::SetField(runLength, 1, totalLength - 1, 1);
+ }
+ lastIndex = i;
+ curBit = Tools::GetField(B, 1, i);
+ }
+ // Handle the last run
+ if (curBit == 1)
+ {
+ totalLength += i - lastIndex;
+ Tools::SetField(runLength, 1, totalLength - 1, 1);
+ }
+
+ // Construct BSGAP structures
+ this->bitRun = new BSGAP(bitRun, u, true, 0);
+ numberOfRuns = this->bitRun->rank(u-1);
+ this->runLength = new BSGAP(runLength, totalLength, true, 0);
+
+ if (freeB)
+ delete [] B;
+}
+
+GapEncode::~GapEncode()
+{
+ // Free BSGAP structures
+ delete bitRun;
+ delete runLength;
+}
+
+ulong GapEncode::rank(ulong i)
+{
+ ulong pred, r = bitRun->rank(i);
+ if (r == 0)
+ return 0;
+ pred = bitRun->select(r);
+
+ ulong bitsBeforeRun = 0;
+ if (r > 1)
+ bitsBeforeRun = runLength->select(r - 1) + 1;
+
+ ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun;
+ if (i - pred >= bitsInRun)
+ return bitsBeforeRun + bitsInRun;
+
+ return bitsBeforeRun + i - pred + 1;
+}
+
+ulong GapEncode::rank0(ulong i)
+{
+ return i - rank(i) + 1;
+}
+
+ulong GapEncode::select(ulong i)
+{
+ throw logic_error("GapEncode::select() is not implemented");
+ return 0;
+}
+
+ulong GapEncode::select0(ulong i)
+{
+ throw logic_error("GapEncode::select0() is not implemented");
+ return 0; // To be implemented...
+}
+
+/**
+ * Saving data fields:
+ ulong u; // Universe size, number of bits in B
+ ulong numberOfRuns; // Number of 1-bit runs
+ BSGAP *bitRun, // Contains 1-bit at the start of each 1-bit-run
+ *runLength; // Contains run lengths for each 1-bit-run
+ ulong totalLength; // Total length of runs
+*/
+void GapEncode::Save(FILE *file) const
+{
+ if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("GapEncode::Save(): file write error (u).");
+ if (std::fwrite(&(this->numberOfRuns), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("GapEncode::Save(): file write error (numberOfRuns).");
+
+ bitRun->Save(file);
+ runLength->Save(file);
+
+ if (std::fwrite(&(this->totalLength), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("GapEncode::Save(): file write error (totalLength).");
+}
+
+GapEncode::GapEncode(FILE *file)
+{
+ if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("GapEncode::Load(): file read error (u).");
+ if (std::fread(&(this->numberOfRuns), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("GapEncode::Load(): file read error (numberOfRuns).");
+
+ bitRun = new BSGAP(file);
+ runLength = new BSGAP(file);
+
+ if (std::fread(&(this->totalLength), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("GapEncode::Load(): file read error (totalLength).");
+}
--- /dev/null
+/**
+ * Gap encoding scheme
+ * Niko Välimäki
+ *
+ *
+ */
+
+#ifndef _GapEncode_H_
+#define _GapEncode_H_
+
+#include "Tools.h"
+#include "BlockArray.h"
+#include "BSGAP.h"
+
+class GapEncode
+{
+public:
+ GapEncode(ulong *, ulong, bool);
+ GapEncode(FILE *);
+ ~GapEncode();
+
+ void Save(FILE *) const;
+ ulong rank(ulong);
+ ulong rankrun(ulong i) {return bitRun->rank(i);};
+ ulong selectrun(ulong i) {return bitRun->select(i);};
+ ulong rank0(ulong);
+ ulong select(ulong);
+ ulong select0(ulong); // Not implemented, yet...
+ bool IsBitSet(ulong i) {
+ ulong r = bitRun->rank(i);
+ if (r == 0)
+ return false;
+ ulong pred = bitRun->select(r);
+
+ ulong bitsBeforeRun = 0;
+ if (r > 1)
+ bitsBeforeRun = runLength->select(r - 1) + 1;
+
+ ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun;
+ if (i - pred >= bitsInRun)
+ return false;
+
+ return true;
+ }
+
+ /**
+ * Returns also the rank_1 of given position i
+ */
+ bool IsBitSet(ulong i, ulong *rank) {
+ *rank = 0;
+ ulong r = bitRun->rank(i);
+ if (r == 0)
+ return false;
+ ulong pred = bitRun->select(r);
+
+ ulong bitsBeforeRun = 0;
+ if (r > 1)
+ bitsBeforeRun = runLength->select(r - 1) + 1;
+
+ ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun;
+ if (i - pred >= bitsInRun)
+ {
+ *rank = bitsBeforeRun + bitsInRun;
+ return false;
+ }
+
+ *rank = bitsBeforeRun + i - pred + 1;
+ return true;
+ }
+
+ /**
+ * Returns also the rank_1 of given position i
+ * and the length of the trailing 0/1-run.
+ */
+ bool IsBitSet(ulong i, ulong *rank, ulong *runl) {
+ *rank = 0;
+ ulong r = bitRun->rank(i);
+ if (r == 0)
+ {
+ // Length of 0-bit run after pos i
+ *runl = bitRun->select(1) - i - 1;
+ return false;
+ }
+ ulong pred = bitRun->select(r);
+
+ ulong bitsBeforeRun = 0;
+ if (r > 1)
+ bitsBeforeRun = runLength->select(r - 1) + 1;
+
+ ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun;
+ if (i - pred >= bitsInRun)
+ {
+ *rank = bitsBeforeRun + bitsInRun;
+ *runl = u - i - 1;
+ if (r < numberOfRuns)
+ *runl = bitRun->select(r+1) - i - 1;
+ return false;
+ }
+
+ *rank = bitsBeforeRun + i - pred + 1;
+ *runl = bitsInRun - (i - pred) - 1;
+ return true;
+ }
+
+private:
+ ulong u; // Universe size, number of bits in B
+ ulong numberOfRuns; // Number of 1-bit runs
+ BSGAP *bitRun, // Contains 1-bit at the start of each 1-bit-run
+ *runLength; // Contains run lengths for each 1-bit-run
+ ulong totalLength; // Total length of runs
+};
+
+#endif
--- /dev/null
+/***************************************************************************
+ * Copyright (C) 2007 by Veli Mäkinen *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+#include "RLWaveletTree.h"
+using namespace std;
+
+void RLWaveletTree::remapAlphabet(uchar *text, ulong n)
+{
+ ulong *chars = new ulong[256/W+1];
+ ulong i;
+ for (i = 0; i < 256/W+1; ++i)
+ chars[i] = 0;
+ // Set up flags:
+ for (i = 0; i < n; ++i)
+ Tools::SetField(chars, 1, text[i], 1);
+ charMap = new BitRank(chars, 256, true);
+
+ // Remap text
+ for (i = 0; i < n; ++i)
+ text[i] = charMap->rank(text[i])-1;
+}
+
+RLWaveletTree::RLWaveletTree(uchar *D, ulong n) {
+ remapAlphabet(D, n);
+// std::cout << "Number of chars: " << charMap->rank(255) << ", maxlevel = " << Tools::CeilLog2(charMap->rank(255)) << std::endl;
+
+ this->n = n;
+ this->maxlevel = Tools::CeilLog2(charMap->rank(255));
+
+ ulong i, k, value;
+ ulong** bucket = new ulong*[maxlevel+1];
+ ulong *wtree = new ulong[n*maxlevel/(8*sizeof(ulong))+1];
+ for (i = 0; i < n*maxlevel/(8*sizeof(ulong))+1; i ++)
+ wtree[i] = 0lu;
+
+ // allocate memory for storing the locations of boudaries
+ for (k=0; k<=maxlevel; k++) {
+ bucket[k] = (ulong *)new ulong[(ulong)1 << k];
+ for (i=0; i<((ulong)1 << k); i++)
+ bucket[k][i]=0;
+ }
+
+ // compute the sizes of buckets
+ for (i=0; i<n; i++) {
+ value = D[i];
+ for (k=0; k<=maxlevel; k++)
+ bucket[k][value >> (maxlevel-k)]++;
+ }
+
+ // cumulate the bucket sizes into boundary locations
+ for (k=0; k<=maxlevel; k++) {
+ for (i=1; i<((ulong)1 << k); i++)
+ bucket[k][i]+=bucket[k][i-1];
+ for (i=((ulong)1 << k)-1; i>=1; i--)
+ bucket[k][i]=bucket[k][i-1];
+ bucket[k][0]=0;
+ }
+
+ //store the leaf information
+ leaves = new ulong[((ulong)1 << maxlevel)+1];
+ for (k=0; k<(ulong)1<<maxlevel; k++)
+ leaves[k] = bucket[maxlevel][k];
+ leaves[k]=n;
+
+ // compute wtree using the information about boundaries
+ for (i=0; i<n; i++) {
+ value = D[i];
+ for (k=0; k<maxlevel; k++)
+ Tools::SetField(wtree,1,k*n+(bucket[k][value >> (maxlevel-k)])++,(value>>(maxlevel-k-1)) & (ulong)1);
+ }
+ delete [] D;
+ // deallocate boundaries
+ for (k=0; k<=maxlevel; k++)
+ delete[] bucket[k];
+ delete[] bucket;
+
+ bitrank = new GapEncode(wtree, n*maxlevel, true);
+ prevRunLen = 0;
+ prevChar = 0;
+ prevPos = 0;
+}
+
+RLWaveletTree::~RLWaveletTree() {
+ delete charMap;
+ delete [] leaves;
+ delete bitrank;
+}
+
+ulong RLWaveletTree::rank(ulong c, ulong i) {
+// if (i >= n) FIXME Replace with assert?
+// std::cout << "RLWT::rank parameter i out of range" << std::endl;
+ ulong k;
+ ulong j=i;
+ ulong left = 0, right = (ulong)1 << maxlevel;
+ ulong before=0;
+ c = charMap->rank(c)-1;
+
+ // first round separately to avoid extra "if"
+ if ((c >> (maxlevel-1)) & (ulong)1) { // go to the right
+ j = bitrank->rank(j)-1;
+ left = right>>1;
+ }
+ else { // go to the left
+ j -= bitrank->rank(j);
+ right = right>>1;
+ }
+ // then the next without the extra if
+ for (k=1;k<maxlevel;k++) {
+ before = bitrank->rank(n*k+leaves[left]-1);
+ if ((c >> (maxlevel-k-1)) & (ulong)1) { // go to the right
+ j = bitrank->rank(n*k+leaves[left]+j)-before-1;
+ left = (right+left)>>1;
+ }
+ else { // go to the left
+ j -= bitrank->rank(n*k+leaves[left]+j)-before;
+ right = (right+left)>>1;
+ }
+ // can never happen: if (left > right) return 0;
+ }
+ return j+1;
+}
+
+ulong RLWaveletTree::select(ulong c, ulong j) {
+ c = charMap->rank(c)-1;
+ ulong left=c, right=c+1;
+ ulong k, i=j-1, before;
+ for (k=maxlevel-1; k>0;--k) {
+ if ((c >> (maxlevel-k-1)) & (ulong)1) { // right side of parent
+ left -= right-left;
+ before = bitrank->rank(n*k+leaves[left]-1);
+ i = bitrank->select(before+i+1)-n*k-leaves[left];
+ }
+ else { // left side of parent
+ right +=right-left;
+ before = n*k+leaves[left]-bitrank->rank(n*k+leaves[left]-1);
+ i = bitrank->select0(before+i+1)-n*k-leaves[left];
+ }
+ }
+ // last level separately to avoid one extra if
+ if ((c >> (maxlevel-1)) & (ulong)1) // right side of parent
+ return bitrank->select(i+1);
+ else // left side of parent
+ return bitrank->select0(i+1);
+}
+
+bool RLWaveletTree::IsCharAtPos(ulong c, ulong i) {
+ return charAtPos(i)==c;
+}
+
+
+ulong RLWaveletTree::charAtPos(ulong i)
+{
+// if (i >= n) // FIXME Replace with assert?
+// std::cout << "RLWT::charAtPos parameter i out of range" << std::endl;
+ ulong k;
+ ulong j=i;
+ ulong left = 0, right = (ulong)1 << maxlevel;
+ ulong before=0;
+ ulong c = 0;
+ ulong rank_tmp = 0;
+ // first round separately to avoid extra "if"
+ if (bitrank->IsBitSet(j, &rank_tmp)) { // go to the right
+ c = c | ((ulong)1 << (maxlevel-1));
+ j = rank_tmp-1; //bitrank->rank(j)-1;
+ left = right>>1;
+ }
+ else { // go to the left
+ // c = c
+ j -= rank_tmp; // bitrank->rank(j);
+ right = right>>1;
+ }
+ // then the next without the extra if
+ for (k=1;k<maxlevel;k++) {
+ before = bitrank->rank(n*k+leaves[left]-1);
+ if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
+ c = c | ((ulong)1 << (maxlevel-k-1));
+ j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
+ left = (right+left)>>1;
+ }
+ else { // go to the left
+ // c = c
+ j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
+ right = (right+left)>>1;
+ }
+ // can never happen: if (left > right) return 0;
+ }
+ return charMap->select(c+1);
+}
+
+/* returns also the rank-1 of char at given position i */
+ulong RLWaveletTree::charAtPos(ulong i, ulong *rank)
+{
+// if (i >= n) // FIXME Replace with assert?
+// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
+ ulong k;
+ ulong j=i;
+ ulong left = 0, right = (ulong)1 << maxlevel;
+ ulong before=0;
+ ulong c = 0;
+ ulong rank_tmp = 0;
+ // first round separately to avoid extra "if"
+ if (bitrank->IsBitSet(j, &rank_tmp)) { // FIXME j or leaves[left]+j ?
+ c = c | ((ulong)1 << (maxlevel-1));
+ j = rank_tmp-1; //bitrank->rank(j)-1;
+ left = right>>1;
+ }
+ else { // go to the left
+ // c = c
+ j -= rank_tmp; //bitrank->rank(j);
+ right = right>>1;
+ }
+ // then the next without the extra if
+ for (k=1;k<maxlevel;k++) {
+ before = bitrank->rank(n*k+leaves[left]-1);
+ if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right
+ c = c | ((ulong)1 << (maxlevel-k-1));
+ j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
+ left = (right+left)>>1;
+ }
+ else { // go to the left
+ // c = c
+ j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
+ right = (right+left)>>1;
+ }
+ // can never happen: if (left > right) return 0;
+ }
+ *rank = j;
+ return charMap->select(c+1);
+}
+
+/* returns also the rank-1 of char at given position i
+ * and trailing char-run length */
+ulong RLWaveletTree::charAtPosRun(ulong i, ulong *rank)
+{
+ // Check for buffered data from previous call
+ if (i > prevPos && i <= prevPos + prevRunLen)
+ {
+ *rank = prevRank + (i-prevPos);
+ return prevChar;
+ }
+
+ prevPos = i;
+// if (i >= n) // FIXME Replace with assert?
+// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl;
+ ulong k;
+ ulong j=i;
+ ulong left = 0, right = (ulong)1 << maxlevel;
+ ulong before=0;
+ ulong c = 0;
+ ulong rank_tmp = 0;
+ ulong runl_tmp = 0;
+ // first round separately to avoid extra "if"
+ if (bitrank->IsBitSet(j, &rank_tmp, &prevRunLen)) {
+ if (j+prevRunLen > n)
+ prevRunLen = n-j-1;
+ c = c | ((ulong)1 << (maxlevel-1));
+ j = rank_tmp-1; //bitrank->rank(j)-1;
+ left = right>>1;
+ }
+ else { // go to the left
+ // c = c
+ if (j+prevRunLen > n)
+ prevRunLen = n-j-1;
+ j -= rank_tmp; //bitrank->rank(j);
+ right = right>>1;
+ }
+
+ // then the next without the extra if
+ for (k=1;k<maxlevel;k++) {
+ before = bitrank->rank(n*k+leaves[left]-1);
+ if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp, &runl_tmp)) { // go to the right
+ if (leaves[left]+j+runl_tmp > leaves[right])
+ runl_tmp = leaves[right] - (leaves[left]+j) - 1;
+ if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run?
+ prevRunLen = runl_tmp;
+ c = c | ((ulong)1 << (maxlevel-k-1));
+ j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1;
+ left = (right+left)>>1;
+ }
+ else { // go to the left
+ if (leaves[left]+j+runl_tmp > leaves[right])
+ runl_tmp = leaves[right] - (leaves[left]+j) - 1;
+ if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run?
+ prevRunLen = runl_tmp;
+ // c = c
+ j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before;
+ right = (right+left)>>1;
+ }
+ // can never happen: if (left > right) return 0;
+ }
+ *rank = j;
+ prevRank = j;
+ prevChar = charMap->select(c+1);
+ return charMap->select(c+1);
+}
+
+/**
+ * Saving data fields:
+ ulong n;
+ ulong maxlevel;
+ ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1
+ GapEncode *bitrank;
+ BitRank *charMap;
+*/
+void RLWaveletTree::Save(FILE *file) const
+{
+ if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("RLWaveletTree::Save(): file write error (n).");
+ if (std::fwrite(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("RLWaveletTree::Save(): file write error (maxlevel).");
+
+ for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
+ {
+ if (std::fwrite(this->leaves + offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("RLWaveletTree::Save(): file write error (leaves).");
+ }
+
+ bitrank->Save(file);
+
+ // Silly
+ ulong *chars = new ulong[256/W+1];
+ // Set up flags:
+ for (ulong i = 0; i < 256; ++i)
+ Tools::SetField(chars, 1, i, charMap->IsBitSet(i));
+
+ for (ulong offset = 0; offset < 256/W+1; ++offset)
+ {
+ if (std::fwrite(chars + offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("RLWaveletTree::Save(): file write error (chars).");
+ }
+
+ delete [] chars;
+}
+
+/**
+ * Load from file
+ */
+RLWaveletTree::RLWaveletTree(FILE *file)
+{
+ if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("RLWaveletTree::Load(): file read error (n).");
+ if (std::fread(&(this->maxlevel), sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("RLWaveletTree::Load(): file read error (maxlevel).");
+
+ for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset)
+ {
+ if (std::fread(this->leaves + offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("RLWaveletTree::Load(): file read error (leaves).");
+ }
+
+ bitrank = new GapEncode(file);
+
+ // Silly
+ ulong *chars = new ulong[256/W+1];
+ // Read flags
+ for (ulong offset = 0; offset < 256/W+1; ++offset)
+ {
+ if (std::fread(chars + offset, sizeof(ulong), 1, file) != 1)
+ throw std::runtime_error("RLWaveletTree::Load(): file read error (chars).");
+ }
+ charMap = new BitRank(chars, 256, true);
+
+ prevRunLen = 0; // Init "buffer" values
+ prevChar = 0;
+ prevPos = 0;
+}
--- /dev/null
+/***************************************************************************
+ * Copyright (C) 2007 by Veli Mäkinen *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ ***************************************************************************/
+
+/*
+ * Wavelet tree by Veli Mäkinen,
+ */
+
+#ifndef _RLWAVELETTREE_H_
+#define _RLWAVELETTREE_H_
+#include "GapEncode.h"
+#include "Tools.h"
+#include "BitRank.h"
+
+class RLWaveletTree {
+private:
+ ulong n;
+ ulong maxlevel;
+ ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1
+ GapEncode *bitrank;
+ BitRank *charMap;
+ void remapAlphabet(uchar*, ulong);
+
+ ulong prevRunLen; // Used by charAtPosRun()
+ ulong prevPos;
+ ulong prevChar;
+ ulong prevRank;
+public:
+ RLWaveletTree(uchar *, ulong);
+ RLWaveletTree(FILE *);
+ ~RLWaveletTree();
+
+ void Save(FILE *) const;
+ ulong rank(ulong c, ulong i);
+
+ ulong select(ulong c, ulong j);
+
+ bool IsCharAtPos(ulong c, ulong i);
+
+ ulong charAtPos(ulong i);
+ ulong charAtPos(ulong i, ulong *rank);
+ ulong charAtPosRun(ulong i, ulong *rank);
+ double spaceInBits() {
+ return 0; // Not implemented
+ }
+};
+
+#endif
*
* Note: Parameters i and j are text positions inside the k'th text.
*/
- virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0;
+// virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0;
/**
* Returns backwards (reverse) iterator to the end of i'th text
*
typedef std::vector<std::pair<DocId, TextPosition> > full_result;
virtual full_result FullContains(uchar const *) const = 0;
+
+ /**
+ *Debug
+ *
+ */
+ virtual TextPosition Lookup(TextPosition) const = 0;
protected:
// Protected constructor; call the static function InitTextCollection().
TextCollection() { };
# define W 32
#endif
+#ifndef WW
#define WW (W*2)
+#endif
+
+#ifndef Wminusone
#define Wminusone (W-1)
+#endif
#ifndef uchar
#define uchar unsigned char
CC = g++
-CPPFLAGS = -Wall -ansi -pedantic -g -O2 -DNDEBUG
+LIBCDSPATH = ../libcds
+CPPFLAGS = -Wall -ansi -g -O2 -I$(LIBCDSPATH)/includes/ -DNDEBUG
+LIBCDSA = $(LIBCDSPATH)/lib/libcds.a
-testTextCollection_obs = testTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o
+testTextCollection_obs = testTextCollection5.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o RLWaveletTree.o GapEncode.o BSGAP.o ${LIBCDSA}
all: $(testTextCollection_obs)