+++ /dev/null
-/**
- * Binary-searchable gap encoding scheme (BSGAP)
- * Niko Välimäki
- *
- * Compile: g++ -Wall -o testBSGAP BSGAP.cpp Tools.cpp
- *
- * Based on:
- *
- * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures:
- * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007).
- */
-
-#include "BSGAP.h"
-#include <stdexcept>
-#include <cassert>
-using namespace std;
-
-
-const uchar BSGAP::bit_table[] = {
- 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,
- 1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,
- 2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,
- 1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,
- 3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,
- 1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,
- 2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1};
-
-/**
- * samplerate == number of keys in each gap encoded binary search tree
- * default value is log^2 n where n is the number of keys.
- */
-BSGAP::BSGAP(ulong *B, ulong u_, bool freeB, ulong sampleRate)
- : u(u_), n(0), topCount(0), bitsInP(0), P(0), b(0), offsetA(0), firstKeyA(0)
-{
- // Count keys
- for (ulong i = 0; i < u; i ++)
- if (Tools::GetField(B, 1, i))
- this->n ++;
-
- if (n == 0) // Sanity check
- {
- if (freeB)
- delete [] B;
-
- return;
- }
-
- unsigned log2n = Tools::FloorLog2(this->n);
- if (log2n < 2)
- log2n = 2;
- if (sampleRate == 0)
- this->b = log2n * log2n;
- else
- this->b = sampleRate;
-
- ulong firstKey = 0, lastKey;
- // Find the first key
- while (!Tools::GetField(B, 1, firstKey))
- firstKey ++;
-
- // Temp array of BSGAP structures
- ulong **tempP = new ulong * [n / b + 1];
- ulong *bitsInBSD = new ulong [n / b + 1];
- ulong totalBits = 0;
- // Number of nodes:
- this->topCount = n%b ? n/b + 1: n/b;
- this->firstKeyA = new BlockArray(topCount, Tools::CeilLog2(this->u));
-
- for (ulong i = 0; i < n/b; i ++)
- {
- // Find the last key
- lastKey = firstKey;
- for (ulong k = 0; k < b + 1 && lastKey < u; lastKey ++)
- if (Tools::GetField(B, 1, lastKey))
- k ++;
- lastKey --;
- // Sanity check
- if (lastKey > u)
- {
- cout << "error: lastKey = " << lastKey << ", u = " << u << endl;
- exit(0);
- }
-
- // lastKey of the last substructure is this->u
- if (i == n/b - 1 && n%b == 0)
- lastKey = u;
-
- tempP[i] = GetSubtree(B, firstKey, firstKey, lastKey, b, true, bitsInBSD[i]);
- totalBits += bitsInBSD[i];
- (*firstKeyA)[i] = firstKey;
-
- firstKey = lastKey;
- }
-
- if (n - (n/b) * b != 0)
- {
- // Find the first key
- while (!Tools::GetField(B, 1, firstKey))
- firstKey ++;
-
- tempP[n/b] = GetSubtree(B, firstKey, firstKey, u, n - (n/b) * b, true, bitsInBSD[n/b]);
- totalBits += bitsInBSD[n/b];
- (*firstKeyA)[n/b] = firstKey;
- }
-
- // Catenate binary trees into one bitvector
- this->P = new ulong[totalBits/W + 1];
- for (ulong i = 0; i < totalBits/W + 1; ++i)
- P[i] = 0;
- this->bitsInP = totalBits;
- ulong offset = 0;
- for (ulong i = 0; i < this->topCount ; ++i)
- {
- BitCopy(P, tempP[i], offset, bitsInBSD[i]);
- delete [] tempP[i];
- offset += bitsInBSD[i];
- }
- delete [] tempP;
-
- // Pointers to bit vector P (binary tree starting offsets)
- // FIXME Use more succinct representation?
- offset = 0;
- this->offsetA = new BlockArray(this->topCount, Tools::CeilLog2(totalBits));
- for (ulong i = 0; i < this->topCount ; ++i)
- {
- (*offsetA)[i] = offset;
- offset += bitsInBSD[i];
- }
- delete [] bitsInBSD;
-
- if (freeB)
- delete [] B;
-}
-
-BSGAP::~BSGAP()
-{
- delete [] P;
- delete offsetA;
- delete firstKeyA;
-}
-
-ulong BSGAP::rank(ulong i)
-{
- if (n == 0) // Trivial case
- return 0;
- if (i >= u)
- throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
- if ((*firstKeyA)[0] > i)
- return 0;
- ulong l = 0, r = topCount - 1;
- while (l < r) // Binary search on the array firstKeyA
- {
- ulong mid = l + (r - l)/2lu;
- if ((*firstKeyA)[mid] < i)
- l = mid + 1;
- else
- r = mid;
- }
- if ((*firstKeyA)[l] > i && l == 0)
- return 0;
- if ((*firstKeyA)[l] == i)
- return 1 + l * b;
- if ((*firstKeyA)[l] > i)
- l --;
-
- ulong lastKey, firstKey = (*firstKeyA)[l];
- if (l == topCount - 1)
- lastKey = u;
- else
- lastKey = (*firstKeyA)[l + 1];
- ulong nSub = b;
- if (l == topCount - 1 && n%b)
- nSub = n%b;
-
- ulong k = rank(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
- // Add rank of substructures P[0], P[1], ..., P[j - 2]
- return l * b + k;
-}
-
-ulong BSGAP::rank0(ulong i)
-{
- return i - rank(i) + 1;
-}
-
-bool BSGAP::IsBitSet(ulong i)
-{
- if (n == 0) // Trivial case
- return false;
- if (i >= u)
- return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
- if ((*firstKeyA)[0] > i)
- return false;
- ulong l = 0, r = topCount - 1;
- while (l < r) // Binary search on the array firstKeyA
- {
- ulong mid = l + (r - l)/2lu;
- if ((*firstKeyA)[mid] < i)
- l = mid + 1;
- else
- r = mid;
- }
- if ((*firstKeyA)[l] > i && l == 0)
- return false;
- if ((*firstKeyA)[l] == i)
- return true;
- if ((*firstKeyA)[l] > i)
- l --;
-
- ulong lastKey, firstKey = (*firstKeyA)[l];
- if (l == topCount - 1)
- lastKey = u;
- else
- lastKey = (*firstKeyA)[l + 1];
- ulong nSub = b;
- if (l == topCount - 1 && n%b)
- nSub = n%b;
-
- return IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
-}
-
-/**
- * Returns also rank_1(i) via the second parameter.
- */
-bool BSGAP::IsBitSet(ulong i, ulong *resultRank)
-{
- *resultRank = 0;
- if (n == 0) // Trivial case
- return false;
- if (i >= u)
- {
- *resultRank = n;
- return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
- }
- if ((*firstKeyA)[0] > i)
- return false;
- ulong l = 0, r = topCount - 1;
- while (l < r) // Binary search on the array firstKeyA
- {
- ulong mid = l + (r - l)/2lu;
- if ((*firstKeyA)[mid] < i)
- l = mid + 1;
- else
- r = mid;
- }
- if ((*firstKeyA)[l] > i && l == 0)
- return false;
- if ((*firstKeyA)[l] == i)
- {
- *resultRank = 1 + l * b;
- return true;
- }
- if ((*firstKeyA)[l] > i)
- l --;
-
- ulong lastKey, firstKey = (*firstKeyA)[l];
- if (l == topCount - 1)
- lastKey = u;
- else
- lastKey = (*firstKeyA)[l + 1];
- ulong nSub = b;
- if (l == topCount - 1 && n%b)
- nSub = n%b;
-
- bool result = IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true, resultRank);
- *resultRank += l * b;
- return result;
-}
-
-
-ulong BSGAP::select(ulong i)
-{
- if (n == 0)
- return 0;
- if (i == 0)
- return 0;
- if (i > n)
- throw std::out_of_range("BSGAP::select(): Parameter i was out of range");
-
- ulong j = i/b;
- if (i%b == 0)
- j --;
- ulong lastKey, firstKey = (*firstKeyA)[j];
- if (j == topCount - 1)
- lastKey = u;
- else
- lastKey = (*firstKeyA)[j + 1];
- ulong nSub = b;
- if (j == topCount - 1 && n%b)
- nSub = n%b;
-
- return select(P, (*offsetA)[j], firstKey, i - j * b, firstKey, lastKey, nSub, true);
-}
-
-ulong BSGAP::select0(ulong i)
-{
- if (n == 0) // Trivial case
- return i-1;
- if (i > u - n)
- throw std::out_of_range("BSGAP::select0(): Parameter i was out of range");
-
- ulong l = 0, r = topCount - 1;
- while (l < r) // Binary search on the array firstKeyA
- {
- ulong mid = l + (r - l)/2lu;
- if ((*firstKeyA)[mid] - mid*b < i) // Number of zeros before [mid]
- l = mid + 1;
- else
- r = mid;
- }
- if (l == 0 && (*firstKeyA)[l] >= i)
- return i-1;
- if (l > 0 && (*firstKeyA)[l] - l*b >= i)
- --l; // FIXME: TEST
-
- i -= (*firstKeyA)[l] - l*b;
-
- ulong lastKey, firstKey = (*firstKeyA)[l];
- if (l == topCount - 1)
- lastKey = u;
- else
- lastKey = (*firstKeyA)[l + 1];
- ulong nSub = b;
- if (l == topCount - 1 && n%b)
- nSub = n%b;
-
- return select0(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
-}
-
-ulong BSGAP::select0(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
-{
- while (1)
- {
- if (n == 0)
- return l+i;
-
- // Check if subtree is full --- FIXME Make this test an assert()!
- if (IsSubtreeFull(firstKey, l, r, n))
- throw runtime_error("BSGAP::select0(): subtree was full"); // Should not happen // (l == firstKey ? l + i - 1: l + i);
-
- // Parse key value
- ulong key;
- ulong x = DeltaDecode(B, offset);
- if (leftChild)
- key = r - x;
- else
- key = l + x;
-
- if (numberOfZeros(firstKey, l, key, n) < i)
- {
- offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
- i = i - numberOfZeros(firstKey, l, key, n);
- l = key;
- n = n - n/2 - 1;
- leftChild = false;
- }
- else
- {
- offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
- r = key;
- n = n/2;
- leftChild = true;
- }
- }
-}
-
-
-ulong BSGAP::rank(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
-{
- // Number of keys in subtrees on left-side
- ulong result = 0;
-
- while (1)
- {
-
- if (n == 0)
- return result;
-
- // Check if subtree is full
- if (IsSubtreeFull(firstKey, l, r, n))
- return result + (l == firstKey ? i - l + 1 : i - l);
-
- // Parse key value
- ulong key;
- ulong x = DeltaDecode(B, offset);
- if (leftChild)
- key = r - x;
- else
- key = l + x;
- if (key == i)
- return result + n/2 + 1;
-
- if (key < i)
- {
- offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
- result += n/2 + 1;
- l = key;
- n = n - n/2 - 1;
- leftChild = false;
- }
- else
- {
- offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
- r = key;
- n = n/2;
- leftChild = true;
- }
- }
-}
-
-
-bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
-{
- // Number of keys in subtrees on left-side
- ulong result = 0;
-
- while (1)
- {
-
- if (n == 0)
- return false;
-
- // Check if subtree is full
- if (IsSubtreeFull(firstKey, l, r, n))
- return result + (l == firstKey ? i - l + 1 : i - l);
-
- // Parse key value
- ulong key;
- ulong x = DeltaDecode(B, offset);
- if (leftChild)
- key = r - x;
- else
- key = l + x;
- if (key == i)
- return true;
-
- if (key < i)
- {
- offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
- result += n/2 + 1;
- l = key;
- n = n - n/2 - 1;
- leftChild = false;
- }
- else
- {
- offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
- r = key;
- n = n/2;
- leftChild = true;
- }
- }
-}
-
-// Returns also the rank of i
-bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild, ulong *resultRank)
-{
- // Number of keys in subtrees on left-side
- ulong result = 0;
-
- while (1)
- {
-
- if (n == 0)
- {
- *resultRank = result;
- return false;
- }
-
- // Check if subtree is full
- if (IsSubtreeFull(firstKey, l, r, n))
- {
- *resultRank = result + (l == firstKey ? i - l + 1 : i - l);
- return true;
- }
-
- // Parse key value
- ulong key;
- ulong x = DeltaDecode(B, offset);
- if (leftChild)
- key = r - x;
- else
- key = l + x;
- if (key == i)
- {
- *resultRank = result + n/2 + 1;
- return true;
- }
-
- if (key < i)
- {
- offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
- result += n/2 + 1;
- l = key;
- n = n - n/2 - 1;
- leftChild = false;
- }
- else
- {
- offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
- r = key;
- n = n/2;
- leftChild = true;
- }
- }
-}
-
-
-ulong BSGAP::select(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
-{
- while (1)
- {
- if (n == 0)
- return 0;
-
- // Check if subtree is full
- if (IsSubtreeFull(firstKey, l, r, n))
- return (l == firstKey ? l + i - 1: l + i);
-
- // Parse key value
- ulong key;
- ulong x = DeltaDecode(B, offset);
- if (leftChild)
- key = r - x;
- else
- key = l + x;
- if (n/2 + 1 == i)
- return key;
- if (n/2 + 1 < i)
- {
- offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
- i = i - n/2 - 1;
- l = key;
- n = n - n/2 - 1;
- leftChild = false;
- }
- else
- {
- offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
- r = key;
- n = n/2;
- leftChild = true;
- }
- }
-}
-
-ulong * BSGAP::GetSubtree(ulong *B, ulong firstKey, ulong l, ulong r, ulong n, bool leftChild, ulong &bits)
-{
- bits = 0;
- if (n == 0)
- return 0;
-
- // Check if subtree is full
- if (IsSubtreeFull(firstKey, l, r, n))
- return 0;
-
- // Pick the median
- ulong key = FindMedian(B, firstKey, l, r, n);
-
- ulong *leftSub, *rightSub, leftBits, rightBits;
- leftSub = GetSubtree(B, firstKey, l, key, n/2, true, leftBits);
- rightSub = GetSubtree(B, firstKey, key, r, n - n/2 - 1, false, rightBits);
-
- // Encode key value
- ulong *keyDelta, keyBits;
- if (leftChild)
- keyDelta = DeltaEncode(r - key, keyBits, true);
- else
- keyDelta = DeltaEncode(key - l, keyBits, true);
-
- // Encode jump offset if left and right subtrees exists
- ulong *jumpOffset = 0, jumpBits = 0;
- if (leftBits != 0 && rightBits != 0)
- jumpOffset = DeltaEncode(leftBits, jumpBits);
-
- // bits is the sum of keyBits, jumpBits, leftBits and rightBits
- bits = keyBits + jumpBits + leftBits + rightBits;
- ulong *output = new ulong[bits / W + 1];
- for (ulong i = 0; i < bits/W+1; ++i)
- output[i] = 0;
-
- /**
- * Write "output"
- */
- BitCopy(output, keyDelta, 0, keyBits);
- delete [] keyDelta;
- ulong offset = keyBits;
-
- if (jumpBits != 0)
- {
- BitCopy(output, jumpOffset, offset, jumpBits);
- offset += jumpBits;
- delete [] jumpOffset;
- }
-
- BitCopy(output, leftSub, offset, leftBits);
- offset += leftBits;
- BitCopy(output, rightSub, offset, rightBits);
- offset += rightBits;
-
- assert(offset == bits);
- if (leftBits != 0)
- delete [] leftSub;
- if (rightBits != 0)
- delete [] rightSub;
- return output;
-}
-
-void BSGAP::BitCopy(ulong *dest, ulong *src, ulong offset, ulong len)
-{
- if (len == 0)
- return;
- ulong i;
-
- if (len <= W)
- {
- Tools::SetVariableField(dest, len, offset, *src);
- return;
- }
-
- for (i = 0; i < len/W; i ++)
- Tools::SetVariableField(dest, W, offset + i * W, src[i]);
-
- if (i * W < len)
- Tools::SetVariableField(dest, len - i*W, offset + i * W, src[i]);
-}
-
-ulong BSGAP::FindMedian(ulong *B, ulong firstKey, ulong l, ulong r, ulong n)
-{
- // Linear scan: slow but affects only the construction time
- n = n/2 + 1;
- if (l != firstKey)
- l ++; // Skip left boundary
-
- while (n && l <= r)
- {
- if (Tools::GetField(B, 1, l))
- n --;
- l ++;
- }
- return l - 1;
-}
-
-ulong BSGAP::DeltaDecode(ulong *B, ulong &offset)
-{
- // Dense coding
- // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
- const unsigned s = 180; // FIXME These should be class variables
- const unsigned c = 256 - s; // FIXME Choose <s,c> values!
- const unsigned b = 8; // one byte.
-
- ulong i = 0;
- ulong base = 0;
- ulong tot = s;
- while (Tools::GetVariableField(B, b, offset) < c)
- {
- i = i*c + Tools::GetVariableField(B, b, offset);
- offset += b;
- base += tot;
- tot = tot * c;
- }
- i = i * s + (Tools::GetVariableField(B, b, offset) - c);
- offset += b;
- return i + base;
-}
-
-
-ulong * BSGAP::DeltaEncode(ulong value, ulong &bits, bool onlyPositive, bool negative)
-{
- // Dense coding
- // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
- const unsigned s = 180; // FIXME These should be class variables
- const unsigned c = 256 - s; // FIXME Choose <s,c> values!
- const unsigned b = 8; // one byte
-
- // Calculate size first:
- bits = b;
- ulong x = value / s;
- while (x > 0)
- {
- --x;
- bits += b;
- x = x/c;
- }
-
- // bits is now the length of the whole codeword
- ulong *B = new ulong[bits/W + 1];
-
- // output codeword (backwards):
- unsigned offset = bits - b;
- ulong output = c + (value % s);
- Tools::SetVariableField(B, b, offset, output);
-
- x = value / s;
- while (x > 0)
- {
- --x;
- offset -= b;
-
- output = x % c;
- Tools::SetVariableField(B, b, offset, output);
-
- x = x/c;
- }
-
- return B;
-}
-
-
-/**
- * Saving the following data fields:
- * ulong u, n; // Universe size, number of 1-bits in B
- * ulong topCount; // Top structure and the number of substructures
- * ulong bitsInP;
- * ulong *P; // Pointer to BSGAP structures
- * unsigned b; // Subdictionary size (\log^2 n)
- * BlockArray *offsetA; // Array of pointers (into bitvector P)
- * BlockArray *firstKeyA; // Array of first key positions of the substructures
- */
-void BSGAP::Save(FILE *file) const
-{
- if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Save(): file write error (u).");
-
- if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Save(): file write error (n).");
-
- if (n == 0)
- return; // Done.
-
- if (std::fwrite(&(this->topCount), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Save(): file write error (topCount).");
-
- if (std::fwrite(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Save(): file write error (bitsInP).");
-
- for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
- {
- if (std::fwrite((this->P)+offset, sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Save(): file write error (P).");
- }
-
- if (std::fwrite(&(this->b), sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("BSGAP::Save(): file write error (b).");
-
- offsetA->Save(file);
- firstKeyA->Save(file);
-}
-
-/**
- * Load from file
- */
-BSGAP::BSGAP(FILE *file)
- : u(0), n(0), topCount(0), bitsInP(0), P(0), b(0), offsetA(0), firstKeyA(0)
-{
- if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Load(): file read error (u).");
-
- if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Load(): file read error (n).");
-
- if (n == 0)
- return; // Done.
-
- if (std::fread(&(this->topCount), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Load(): file read error (topCount).");
-
- if (std::fread(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Load(): file read error (bitsInP).");
-
- P = new ulong[bitsInP/W+1];
- for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
- {
- if (std::fread(this->P+offset, sizeof(ulong), 1, file) != 1)
- throw std::runtime_error("BSGAP::Load(): file read error (P).");
- }
-
- if (std::fread(&(this->b), sizeof(unsigned), 1, file) != 1)
- throw std::runtime_error("BSGAP::Load(): file read error (b).");
-
- offsetA = new BlockArray(file);
- firstKeyA = new BlockArray(file);
-}
-