X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=BSGAP.cpp;fp=BSGAP.cpp;h=b2db287df832abf8d689a02582d95eab2825ce0f;hb=dbd62fe27e80414b4cf4fbf512ad1910916e6967;hp=0000000000000000000000000000000000000000;hpb=dee0161e4f31f06e9389db9986766395c1b1d2b8;p=SXSI%2FTextCollection.git diff --git a/BSGAP.cpp b/BSGAP.cpp new file mode 100644 index 0000000..b2db287 --- /dev/null +++ b/BSGAP.cpp @@ -0,0 +1,779 @@ +/** + * Binary-searchable gap encoding scheme (BSGAP) + * Niko Välimäki + * + * Compile: g++ -Wall -o testBSGAP BSGAP.cpp Tools.cpp + * + * Based on: + * + * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures: + * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007). + */ + +#include "BSGAP.h" +#include +#include +using namespace std; + + +const uchar BSGAP::bit_table[] = { + 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0, + 1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0, + 2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0, + 1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0, + 3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0, + 1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0, + 2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1}; + +/** + * samplerate == number of keys in each gap encoded binary search tree + * default value is log^2 n where n is the number of keys. + */ +BSGAP::BSGAP(ulong *B, ulong u, bool freeB, ulong sampleRate) +{ + this->u = u; + this->n = 0; + this->P = 0; + this->offsetA = 0; + this->firstKeyA = 0; + this->topCount = 0; + + // Count keys + for (ulong i = 0; i < u; i ++) + if (Tools::GetField(B, 1, i)) + this->n ++; + + if (n == 0) // Sanity check + { + if (freeB && B) + delete [] B; + + return; + } + + unsigned log2n = Tools::FloorLog2(this->n); + if (log2n < 2) + log2n = 2; + if (sampleRate == 0) + this->b = log2n * log2n; + else + this->b = sampleRate; + + ulong firstKey = 0, lastKey; + // Find the first key + while (!Tools::GetField(B, 1, firstKey)) + firstKey ++; + + // Temp array of BSGAP structures + ulong **tempP = new ulong * [n / b + 1]; + ulong *bitsInBSD = new ulong [n / b + 1]; + ulong totalBits = 0; + this->firstKeyA = new BlockArray(n/b + 1, Tools::CeilLog2(this->u)); + + for (ulong i = 0; i < n/b; i ++) + { + // Find the last key + lastKey = firstKey; + for (ulong k = 0; k < b + 1 && lastKey < u; lastKey ++) + if (Tools::GetField(B, 1, lastKey)) + k ++; + lastKey --; + // Sanity check + if (lastKey > u) + { + cout << "error: lastKey = " << lastKey << ", u = " << u << endl; + exit(0); + } + + // lastKey of the last substructure is this->u + if (i == n/b - 1 && n%b == 0) + lastKey = u; + + tempP[i] = GetSubtree(B, firstKey, firstKey, lastKey, b, true, bitsInBSD[i]); // FIXME: left of right subtree? + totalBits += bitsInBSD[i]; + (*firstKeyA)[i] = firstKey; + + firstKey = lastKey; + } + + if (n - (n/b) * b != 0) + { + // Find the first key + while (!Tools::GetField(B, 1, firstKey)) + firstKey ++; + + tempP[n/b] = GetSubtree(B, firstKey, firstKey, u, n - (n/b) * b, true, bitsInBSD[n/b]); // FIXME: left of right subtree? + totalBits += bitsInBSD[n/b]; + (*firstKeyA)[n/b] = firstKey; + } + + // Number of nodes: + this->topCount = n%b ? n/b + 1: n/b; + + // Catenate binary trees into one bitvector + this->P = new ulong[totalBits/W + 1]; + this->bitsInP = totalBits; + ulong offset = 0; + for (ulong i = 0; i < this->topCount ; ++i) + { + BitCopy(P, tempP[i], offset, bitsInBSD[i]); + delete [] tempP[i]; + offset += bitsInBSD[i]; + } + delete [] tempP; + + // Pointers to bit vector P (binary tree starting offsets) + // FIXME Use more succinct representation? + offset = 0; + this->offsetA = new BlockArray(this->topCount, Tools::CeilLog2(totalBits)); + for (ulong i = 0; i < this->topCount ; ++i) + { + (*offsetA)[i] = offset; + offset += bitsInBSD[i]; + } + delete [] bitsInBSD; + + if (freeB) + delete [] B; +} + +BSGAP::~BSGAP() +{ + delete [] P; + delete offsetA; + delete firstKeyA; +} + +ulong BSGAP::rank(ulong i) +{ + if (n == 0) // Trivial case + return 0; + if (i >= u) + throw std::out_of_range("BSGAP::rank(): Parameter i was out of range"); + if ((*firstKeyA)[0] > i) + return 0; + ulong l = 0, r = topCount - 1; + while (l < r) // Binary search on the array firstKeyA + { + ulong mid = l + (r - l)/2lu; + if ((*firstKeyA)[mid] < i) + l = mid + 1; + else + r = mid; + } + if ((*firstKeyA)[l] > i && l == 0) + return 0; + if ((*firstKeyA)[l] == i) + return 1 + l * b; + if ((*firstKeyA)[l] > i) + l --; + + ulong lastKey, firstKey = (*firstKeyA)[l]; + if (l == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[l + 1]; + ulong nSub = b; + if (l == topCount - 1 && n%b) + nSub = n%b; + + ulong k = rank(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true); + // Add rank of substructures P[0], P[1], ..., P[j - 2] + return l * b + k; +} + +ulong BSGAP::rank0(ulong i) +{ + return i - rank(i) + 1; +} + +bool BSGAP::IsBitSet(ulong i) +{ + if (n == 0) // Trivial case + return false; + if (i >= u) + return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range"); + if ((*firstKeyA)[0] > i) + return false; + ulong l = 0, r = topCount - 1; + while (l < r) // Binary search on the array firstKeyA + { + ulong mid = l + (r - l)/2lu; + if ((*firstKeyA)[mid] < i) + l = mid + 1; + else + r = mid; + } + if ((*firstKeyA)[l] > i && l == 0) + return false; + if ((*firstKeyA)[l] == i) + return true; + if ((*firstKeyA)[l] > i) + l --; + + ulong lastKey, firstKey = (*firstKeyA)[l]; + if (l == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[l + 1]; + ulong nSub = b; + if (l == topCount - 1 && n%b) + nSub = n%b; + + return IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true); +} + +/** + * Returns also rank_1(i) via the second parameter. + */ +bool BSGAP::IsBitSet(ulong i, ulong *resultRank) +{ + *resultRank = 0; + if (n == 0) // Trivial case + return false; + if (i >= u) + { + *resultRank = n; + return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range"); + } + if ((*firstKeyA)[0] > i) + return false; + ulong l = 0, r = topCount - 1; + while (l < r) // Binary search on the array firstKeyA + { + ulong mid = l + (r - l)/2lu; + if ((*firstKeyA)[mid] < i) + l = mid + 1; + else + r = mid; + } + if ((*firstKeyA)[l] > i && l == 0) + return false; + if ((*firstKeyA)[l] == i) + { + *resultRank = 1 + l * b; + return true; + } + if ((*firstKeyA)[l] > i) + l --; + + ulong lastKey, firstKey = (*firstKeyA)[l]; + if (l == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[l + 1]; + ulong nSub = b; + if (l == topCount - 1 && n%b) + nSub = n%b; + + bool result = IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true, resultRank); + *resultRank += l * b; + return result; +} + + +ulong BSGAP::select(ulong i) +{ + if (n == 0) + return 0; + if (i == 0) + return 0; + if (i > n) + throw std::out_of_range("BSGAP::select(): Parameter i was out of range"); + + ulong j = i/b; + if (i%b == 0) + j --; + ulong lastKey, firstKey = (*firstKeyA)[j]; + if (j == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[j + 1]; + ulong nSub = b; + if (j == topCount - 1 && n%b) + nSub = n%b; + + return select(P, (*offsetA)[j], firstKey, i - j * b, firstKey, lastKey, nSub, true); +} + +ulong BSGAP::select0(ulong i) +{ + if (n == 0) // Trivial case + return i-1; + if (i > u - n) + throw std::out_of_range("BSGAP::select0(): Parameter i was out of range"); + + ulong l = 0, r = topCount - 1; + while (l < r) // Binary search on the array firstKeyA + { + ulong mid = l + (r - l)/2lu; + if ((*firstKeyA)[mid] - mid*b < i) // Number of zeros before [mid] + l = mid + 1; + else + r = mid; + } + if (l == 0 && (*firstKeyA)[l] >= i) + return i-1; + if (l > 0 && (*firstKeyA)[l] - l*b >= i) + --l; // FIXME: TEST + + i -= (*firstKeyA)[l] - l*b; + + ulong lastKey, firstKey = (*firstKeyA)[l]; + if (l == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[l + 1]; + ulong nSub = b; + if (l == topCount - 1 && n%b) + nSub = n%b; + + return select0(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true); +} + +ulong BSGAP::select0(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) +{ + while (1) + { + if (n == 0) + return l+i; + + // Check if subtree is full --- FIXME Make this test an assert()! + if (IsSubtreeFull(firstKey, l, r, n)) + throw runtime_error("BSGAP::select0(): subtree was full"); // Should not happen // (l == firstKey ? l + i - 1: l + i); + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + + if (numberOfZeros(firstKey, l, key, n) < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + i = i - numberOfZeros(firstKey, l, key, n); + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + + +ulong BSGAP::rank(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) +{ + // Number of keys in subtrees on left-side + ulong result = 0; + + while (1) + { + + if (n == 0) + return result; + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + return result + (l == firstKey ? i - l + 1 : i - l); + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + if (key == i) + return result + n/2 + 1; + + if (key < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + result += n/2 + 1; + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + + +bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) +{ + // Number of keys in subtrees on left-side + ulong result = 0; + + while (1) + { + + if (n == 0) + return false; + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + return result + (l == firstKey ? i - l + 1 : i - l); + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + if (key == i) + return true; + + if (key < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + result += n/2 + 1; + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + +// Returns also the rank of i +bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild, ulong *resultRank) +{ + // Number of keys in subtrees on left-side + ulong result = 0; + + while (1) + { + + if (n == 0) + { + *resultRank = result; + return false; + } + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + { + *resultRank = result + (l == firstKey ? i - l + 1 : i - l); + return true; + } + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + if (key == i) + { + *resultRank = result + n/2 + 1; + return true; + } + + if (key < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + result += n/2 + 1; + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + + +ulong BSGAP::select(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) +{ + while (1) + { + if (n == 0) + return 0; + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + return (l == firstKey ? l + i - 1: l + i); + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + if (n/2 + 1 == i) + return key; + if (n/2 + 1 < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + i = i - n/2 - 1; + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + +ulong * BSGAP::GetSubtree(ulong *B, ulong firstKey, ulong l, ulong r, ulong n, bool leftChild, ulong &bits) +{ + bits = 0; + if (n == 0) + return 0; + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + return 0; + + // Pick the median + ulong key = FindMedian(B, firstKey, l, r, n); + + ulong *leftSub, *rightSub, leftBits, rightBits; + leftSub = GetSubtree(B, firstKey, l, key, n/2, true, leftBits); + rightSub = GetSubtree(B, firstKey, key, r, n - n/2 - 1, false, rightBits); + + // Encode key value + ulong *keyDelta, keyBits; + if (leftChild) + keyDelta = DeltaEncode(r - key, keyBits, true); + else + keyDelta = DeltaEncode(key - l, keyBits, true); + + // Encode jump offset if left and right subtrees exists + ulong *jumpOffset = 0, jumpBits = 0; + if (leftBits != 0 && rightBits != 0) + jumpOffset = DeltaEncode(leftBits, jumpBits); + + // bits is the sum of keyBits, jumpBits, leftBits and rightBits + bits = keyBits + jumpBits + leftBits + rightBits; + ulong *output = new ulong[bits / W + 1]; + + /** + * Write "output" + */ + BitCopy(output, keyDelta, 0, keyBits); + delete [] keyDelta; + ulong offset = keyBits; + + if (jumpBits != 0) + { + BitCopy(output, jumpOffset, offset, jumpBits); + offset += jumpBits; + delete [] jumpOffset; + } + + BitCopy(output, leftSub, offset, leftBits); + offset += leftBits; + BitCopy(output, rightSub, offset, rightBits); + offset += rightBits; + + assert(offset == bits); + if (leftBits != 0) + delete [] leftSub; + if (rightBits != 0) + delete [] rightSub; + return output; +} + +void BSGAP::BitCopy(ulong *dest, ulong *src, ulong offset, ulong len) +{ + if (len == 0) + return; + ulong i; + + if (len <= W) + { + Tools::SetVariableField(dest, len, offset, *src); + return; + } + + for (i = 0; i < len/W; i ++) + Tools::SetVariableField(dest, W, offset + i * W, src[i]); + + if (i * W < len) + Tools::SetVariableField(dest, len - i*W, offset + i * W, src[i]); +} + +ulong BSGAP::FindMedian(ulong *B, ulong firstKey, ulong l, ulong r, ulong n) +{ + // Linear scan: slow but affects only the construction time + n = n/2 + 1; + if (l != firstKey) + l ++; // Skip left boundary + + while (n && l <= r) + { + if (Tools::GetField(B, 1, l)) + n --; + l ++; + } + return l - 1; +} + +ulong BSGAP::DeltaDecode(ulong *B, ulong &offset) +{ + // Dense coding + // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf + const unsigned s = 180; // FIXME These should be class variables + const unsigned c = 256 - s; // FIXME Choose values! + const unsigned b = 8; // one byte. + + ulong i = 0; + ulong base = 0; + ulong tot = s; + while (Tools::GetVariableField(B, b, offset) < c) + { + i = i*c + Tools::GetVariableField(B, b, offset); + offset += b; + base += tot; + tot = tot * c; + } + i = i * s + (Tools::GetVariableField(B, b, offset) - c); + offset += b; + return i + base; +} + + +ulong * BSGAP::DeltaEncode(ulong value, ulong &bits, bool onlyPositive, bool negative) +{ + // Dense coding + // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf + const unsigned s = 180; // FIXME These should be class variables + const unsigned c = 256 - s; // FIXME Choose values! + const unsigned b = 8; // one byte + + // Calculate size first: + bits = b; + ulong x = value / s; + while (x > 0) + { + --x; + bits += b; + x = x/c; + } + + // bits is now the length of the whole codeword + ulong *B = new ulong[bits/W + 1]; + + // output codeword (backwards): + unsigned offset = bits - b; + ulong output = c + (value % s); + Tools::SetVariableField(B, b, offset, output); + + x = value / s; + while (x > 0) + { + --x; + offset -= b; + + output = x % c; + Tools::SetVariableField(B, b, offset, output); + + x = x/c; + } + + return B; +} + + +/** + * Saving the following data fields: + * ulong u, n; // Universe size, number of 1-bits in B + * ulong topCount; // Top structure and the number of substructures + * ulong bitsInP; + * ulong *P; // Pointer to BSGAP structures + * unsigned b; // Subdictionary size (\log^2 n) + * BlockArray *offsetA; // Array of pointers (into bitvector P) + * BlockArray *firstKeyA; // Array of first key positions of the substructures + */ +void BSGAP::Save(FILE *file) const +{ + if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (u)."); + + if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (n)."); + + if (std::fwrite(&(this->topCount), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (topCount)."); + + if (std::fwrite(&(this->bitsInP), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (bitsInP)."); + + for (ulong offset = 0; offset < bitsInP/W+1; offset ++) + { + if (std::fwrite(this->P+offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (P)."); + } + + if (std::fwrite(&(this->b), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (b)."); + + offsetA->Save(file); + firstKeyA->Save(file); +} + +/** + * Load from file + */ +BSGAP::BSGAP(FILE *file) +{ + if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (u)."); + + if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (n)."); + + if (std::fread(&(this->topCount), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (topCount)."); + + if (std::fread(&(this->bitsInP), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (bitsInP)."); + + P = new ulong[bitsInP/W+1]; + for (ulong offset = 0; offset < bitsInP/W+1; offset ++) + { + if (std::fread(this->P+offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (P)."); + } + + if (std::fread(&(this->b), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (b)."); + + offsetA = new BlockArray(file); + firstKeyA = new BlockArray(file); +} +