From 55ebb7f307e569d8bde2df6c6e80a5d2c5e849a4 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Mon, 20 Apr 2009 11:27:12 +0000 Subject: [PATCH] Removed git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@331 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- BSGAP.cpp | 783 ------------------------------------------------------ 1 file changed, 783 deletions(-) delete mode 100644 BSGAP.cpp diff --git a/BSGAP.cpp b/BSGAP.cpp deleted file mode 100644 index 4a3d1c6..0000000 --- a/BSGAP.cpp +++ /dev/null @@ -1,783 +0,0 @@ -/** - * Binary-searchable gap encoding scheme (BSGAP) - * Niko Välimäki - * - * Compile: g++ -Wall -o testBSGAP BSGAP.cpp Tools.cpp - * - * Based on: - * - * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures: - * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007). - */ - -#include "BSGAP.h" -#include -#include -using namespace std; - - -const uchar BSGAP::bit_table[] = { - 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0, - 1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0, - 2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0, - 1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0, - 3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0, - 1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0, - 2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1}; - -/** - * samplerate == number of keys in each gap encoded binary search tree - * default value is log^2 n where n is the number of keys. - */ -BSGAP::BSGAP(ulong *B, ulong u_, bool freeB, ulong sampleRate) - : u(u_), n(0), topCount(0), bitsInP(0), P(0), b(0), offsetA(0), firstKeyA(0) -{ - // Count keys - for (ulong i = 0; i < u; i ++) - if (Tools::GetField(B, 1, i)) - this->n ++; - - if (n == 0) // Sanity check - { - if (freeB) - delete [] B; - - return; - } - - unsigned log2n = Tools::FloorLog2(this->n); - if (log2n < 2) - log2n = 2; - if (sampleRate == 0) - this->b = log2n * log2n; - else - this->b = sampleRate; - - ulong firstKey = 0, lastKey; - // Find the first key - while (!Tools::GetField(B, 1, firstKey)) - firstKey ++; - - // Temp array of BSGAP structures - ulong **tempP = new ulong * [n / b + 1]; - ulong *bitsInBSD = new ulong [n / b + 1]; - ulong totalBits = 0; - // Number of nodes: - this->topCount = n%b ? n/b + 1: n/b; - this->firstKeyA = new BlockArray(topCount, Tools::CeilLog2(this->u)); - - for (ulong i = 0; i < n/b; i ++) - { - // Find the last key - lastKey = firstKey; - for (ulong k = 0; k < b + 1 && lastKey < u; lastKey ++) - if (Tools::GetField(B, 1, lastKey)) - k ++; - lastKey --; - // Sanity check - if (lastKey > u) - { - cout << "error: lastKey = " << lastKey << ", u = " << u << endl; - exit(0); - } - - // lastKey of the last substructure is this->u - if (i == n/b - 1 && n%b == 0) - lastKey = u; - - tempP[i] = GetSubtree(B, firstKey, firstKey, lastKey, b, true, bitsInBSD[i]); - totalBits += bitsInBSD[i]; - (*firstKeyA)[i] = firstKey; - - firstKey = lastKey; - } - - if (n - (n/b) * b != 0) - { - // Find the first key - while (!Tools::GetField(B, 1, firstKey)) - firstKey ++; - - tempP[n/b] = GetSubtree(B, firstKey, firstKey, u, n - (n/b) * b, true, bitsInBSD[n/b]); - totalBits += bitsInBSD[n/b]; - (*firstKeyA)[n/b] = firstKey; - } - - // Catenate binary trees into one bitvector - this->P = new ulong[totalBits/W + 1]; - for (ulong i = 0; i < totalBits/W + 1; ++i) - P[i] = 0; - this->bitsInP = totalBits; - ulong offset = 0; - for (ulong i = 0; i < this->topCount ; ++i) - { - BitCopy(P, tempP[i], offset, bitsInBSD[i]); - delete [] tempP[i]; - offset += bitsInBSD[i]; - } - delete [] tempP; - - // Pointers to bit vector P (binary tree starting offsets) - // FIXME Use more succinct representation? - offset = 0; - this->offsetA = new BlockArray(this->topCount, Tools::CeilLog2(totalBits)); - for (ulong i = 0; i < this->topCount ; ++i) - { - (*offsetA)[i] = offset; - offset += bitsInBSD[i]; - } - delete [] bitsInBSD; - - if (freeB) - delete [] B; -} - -BSGAP::~BSGAP() -{ - delete [] P; - delete offsetA; - delete firstKeyA; -} - -ulong BSGAP::rank(ulong i) -{ - if (n == 0) // Trivial case - return 0; - if (i >= u) - throw std::out_of_range("BSGAP::rank(): Parameter i was out of range"); - if ((*firstKeyA)[0] > i) - return 0; - ulong l = 0, r = topCount - 1; - while (l < r) // Binary search on the array firstKeyA - { - ulong mid = l + (r - l)/2lu; - if ((*firstKeyA)[mid] < i) - l = mid + 1; - else - r = mid; - } - if ((*firstKeyA)[l] > i && l == 0) - return 0; - if ((*firstKeyA)[l] == i) - return 1 + l * b; - if ((*firstKeyA)[l] > i) - l --; - - ulong lastKey, firstKey = (*firstKeyA)[l]; - if (l == topCount - 1) - lastKey = u; - else - lastKey = (*firstKeyA)[l + 1]; - ulong nSub = b; - if (l == topCount - 1 && n%b) - nSub = n%b; - - ulong k = rank(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true); - // Add rank of substructures P[0], P[1], ..., P[j - 2] - return l * b + k; -} - -ulong BSGAP::rank0(ulong i) -{ - return i - rank(i) + 1; -} - -bool BSGAP::IsBitSet(ulong i) -{ - if (n == 0) // Trivial case - return false; - if (i >= u) - return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range"); - if ((*firstKeyA)[0] > i) - return false; - ulong l = 0, r = topCount - 1; - while (l < r) // Binary search on the array firstKeyA - { - ulong mid = l + (r - l)/2lu; - if ((*firstKeyA)[mid] < i) - l = mid + 1; - else - r = mid; - } - if ((*firstKeyA)[l] > i && l == 0) - return false; - if ((*firstKeyA)[l] == i) - return true; - if ((*firstKeyA)[l] > i) - l --; - - ulong lastKey, firstKey = (*firstKeyA)[l]; - if (l == topCount - 1) - lastKey = u; - else - lastKey = (*firstKeyA)[l + 1]; - ulong nSub = b; - if (l == topCount - 1 && n%b) - nSub = n%b; - - return IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true); -} - -/** - * Returns also rank_1(i) via the second parameter. - */ -bool BSGAP::IsBitSet(ulong i, ulong *resultRank) -{ - *resultRank = 0; - if (n == 0) // Trivial case - return false; - if (i >= u) - { - *resultRank = n; - return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range"); - } - if ((*firstKeyA)[0] > i) - return false; - ulong l = 0, r = topCount - 1; - while (l < r) // Binary search on the array firstKeyA - { - ulong mid = l + (r - l)/2lu; - if ((*firstKeyA)[mid] < i) - l = mid + 1; - else - r = mid; - } - if ((*firstKeyA)[l] > i && l == 0) - return false; - if ((*firstKeyA)[l] == i) - { - *resultRank = 1 + l * b; - return true; - } - if ((*firstKeyA)[l] > i) - l --; - - ulong lastKey, firstKey = (*firstKeyA)[l]; - if (l == topCount - 1) - lastKey = u; - else - lastKey = (*firstKeyA)[l + 1]; - ulong nSub = b; - if (l == topCount - 1 && n%b) - nSub = n%b; - - bool result = IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true, resultRank); - *resultRank += l * b; - return result; -} - - -ulong BSGAP::select(ulong i) -{ - if (n == 0) - return 0; - if (i == 0) - return 0; - if (i > n) - throw std::out_of_range("BSGAP::select(): Parameter i was out of range"); - - ulong j = i/b; - if (i%b == 0) - j --; - ulong lastKey, firstKey = (*firstKeyA)[j]; - if (j == topCount - 1) - lastKey = u; - else - lastKey = (*firstKeyA)[j + 1]; - ulong nSub = b; - if (j == topCount - 1 && n%b) - nSub = n%b; - - return select(P, (*offsetA)[j], firstKey, i - j * b, firstKey, lastKey, nSub, true); -} - -ulong BSGAP::select0(ulong i) -{ - if (n == 0) // Trivial case - return i-1; - if (i > u - n) - throw std::out_of_range("BSGAP::select0(): Parameter i was out of range"); - - ulong l = 0, r = topCount - 1; - while (l < r) // Binary search on the array firstKeyA - { - ulong mid = l + (r - l)/2lu; - if ((*firstKeyA)[mid] - mid*b < i) // Number of zeros before [mid] - l = mid + 1; - else - r = mid; - } - if (l == 0 && (*firstKeyA)[l] >= i) - return i-1; - if (l > 0 && (*firstKeyA)[l] - l*b >= i) - --l; // FIXME: TEST - - i -= (*firstKeyA)[l] - l*b; - - ulong lastKey, firstKey = (*firstKeyA)[l]; - if (l == topCount - 1) - lastKey = u; - else - lastKey = (*firstKeyA)[l + 1]; - ulong nSub = b; - if (l == topCount - 1 && n%b) - nSub = n%b; - - return select0(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true); -} - -ulong BSGAP::select0(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) -{ - while (1) - { - if (n == 0) - return l+i; - - // Check if subtree is full --- FIXME Make this test an assert()! - if (IsSubtreeFull(firstKey, l, r, n)) - throw runtime_error("BSGAP::select0(): subtree was full"); // Should not happen // (l == firstKey ? l + i - 1: l + i); - - // Parse key value - ulong key; - ulong x = DeltaDecode(B, offset); - if (leftChild) - key = r - x; - else - key = l + x; - - if (numberOfZeros(firstKey, l, key, n) < i) - { - offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); - i = i - numberOfZeros(firstKey, l, key, n); - l = key; - n = n - n/2 - 1; - leftChild = false; - } - else - { - offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); - r = key; - n = n/2; - leftChild = true; - } - } -} - - -ulong BSGAP::rank(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) -{ - // Number of keys in subtrees on left-side - ulong result = 0; - - while (1) - { - - if (n == 0) - return result; - - // Check if subtree is full - if (IsSubtreeFull(firstKey, l, r, n)) - return result + (l == firstKey ? i - l + 1 : i - l); - - // Parse key value - ulong key; - ulong x = DeltaDecode(B, offset); - if (leftChild) - key = r - x; - else - key = l + x; - if (key == i) - return result + n/2 + 1; - - if (key < i) - { - offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); - result += n/2 + 1; - l = key; - n = n - n/2 - 1; - leftChild = false; - } - else - { - offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); - r = key; - n = n/2; - leftChild = true; - } - } -} - - -bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) -{ - // Number of keys in subtrees on left-side - ulong result = 0; - - while (1) - { - - if (n == 0) - return false; - - // Check if subtree is full - if (IsSubtreeFull(firstKey, l, r, n)) - return result + (l == firstKey ? i - l + 1 : i - l); - - // Parse key value - ulong key; - ulong x = DeltaDecode(B, offset); - if (leftChild) - key = r - x; - else - key = l + x; - if (key == i) - return true; - - if (key < i) - { - offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); - result += n/2 + 1; - l = key; - n = n - n/2 - 1; - leftChild = false; - } - else - { - offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); - r = key; - n = n/2; - leftChild = true; - } - } -} - -// Returns also the rank of i -bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild, ulong *resultRank) -{ - // Number of keys in subtrees on left-side - ulong result = 0; - - while (1) - { - - if (n == 0) - { - *resultRank = result; - return false; - } - - // Check if subtree is full - if (IsSubtreeFull(firstKey, l, r, n)) - { - *resultRank = result + (l == firstKey ? i - l + 1 : i - l); - return true; - } - - // Parse key value - ulong key; - ulong x = DeltaDecode(B, offset); - if (leftChild) - key = r - x; - else - key = l + x; - if (key == i) - { - *resultRank = result + n/2 + 1; - return true; - } - - if (key < i) - { - offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); - result += n/2 + 1; - l = key; - n = n - n/2 - 1; - leftChild = false; - } - else - { - offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); - r = key; - n = n/2; - leftChild = true; - } - } -} - - -ulong BSGAP::select(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) -{ - while (1) - { - if (n == 0) - return 0; - - // Check if subtree is full - if (IsSubtreeFull(firstKey, l, r, n)) - return (l == firstKey ? l + i - 1: l + i); - - // Parse key value - ulong key; - ulong x = DeltaDecode(B, offset); - if (leftChild) - key = r - x; - else - key = l + x; - if (n/2 + 1 == i) - return key; - if (n/2 + 1 < i) - { - offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); - i = i - n/2 - 1; - l = key; - n = n - n/2 - 1; - leftChild = false; - } - else - { - offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); - r = key; - n = n/2; - leftChild = true; - } - } -} - -ulong * BSGAP::GetSubtree(ulong *B, ulong firstKey, ulong l, ulong r, ulong n, bool leftChild, ulong &bits) -{ - bits = 0; - if (n == 0) - return 0; - - // Check if subtree is full - if (IsSubtreeFull(firstKey, l, r, n)) - return 0; - - // Pick the median - ulong key = FindMedian(B, firstKey, l, r, n); - - ulong *leftSub, *rightSub, leftBits, rightBits; - leftSub = GetSubtree(B, firstKey, l, key, n/2, true, leftBits); - rightSub = GetSubtree(B, firstKey, key, r, n - n/2 - 1, false, rightBits); - - // Encode key value - ulong *keyDelta, keyBits; - if (leftChild) - keyDelta = DeltaEncode(r - key, keyBits, true); - else - keyDelta = DeltaEncode(key - l, keyBits, true); - - // Encode jump offset if left and right subtrees exists - ulong *jumpOffset = 0, jumpBits = 0; - if (leftBits != 0 && rightBits != 0) - jumpOffset = DeltaEncode(leftBits, jumpBits); - - // bits is the sum of keyBits, jumpBits, leftBits and rightBits - bits = keyBits + jumpBits + leftBits + rightBits; - ulong *output = new ulong[bits / W + 1]; - for (ulong i = 0; i < bits/W+1; ++i) - output[i] = 0; - - /** - * Write "output" - */ - BitCopy(output, keyDelta, 0, keyBits); - delete [] keyDelta; - ulong offset = keyBits; - - if (jumpBits != 0) - { - BitCopy(output, jumpOffset, offset, jumpBits); - offset += jumpBits; - delete [] jumpOffset; - } - - BitCopy(output, leftSub, offset, leftBits); - offset += leftBits; - BitCopy(output, rightSub, offset, rightBits); - offset += rightBits; - - assert(offset == bits); - if (leftBits != 0) - delete [] leftSub; - if (rightBits != 0) - delete [] rightSub; - return output; -} - -void BSGAP::BitCopy(ulong *dest, ulong *src, ulong offset, ulong len) -{ - if (len == 0) - return; - ulong i; - - if (len <= W) - { - Tools::SetVariableField(dest, len, offset, *src); - return; - } - - for (i = 0; i < len/W; i ++) - Tools::SetVariableField(dest, W, offset + i * W, src[i]); - - if (i * W < len) - Tools::SetVariableField(dest, len - i*W, offset + i * W, src[i]); -} - -ulong BSGAP::FindMedian(ulong *B, ulong firstKey, ulong l, ulong r, ulong n) -{ - // Linear scan: slow but affects only the construction time - n = n/2 + 1; - if (l != firstKey) - l ++; // Skip left boundary - - while (n && l <= r) - { - if (Tools::GetField(B, 1, l)) - n --; - l ++; - } - return l - 1; -} - -ulong BSGAP::DeltaDecode(ulong *B, ulong &offset) -{ - // Dense coding - // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf - const unsigned s = 180; // FIXME These should be class variables - const unsigned c = 256 - s; // FIXME Choose values! - const unsigned b = 8; // one byte. - - ulong i = 0; - ulong base = 0; - ulong tot = s; - while (Tools::GetVariableField(B, b, offset) < c) - { - i = i*c + Tools::GetVariableField(B, b, offset); - offset += b; - base += tot; - tot = tot * c; - } - i = i * s + (Tools::GetVariableField(B, b, offset) - c); - offset += b; - return i + base; -} - - -ulong * BSGAP::DeltaEncode(ulong value, ulong &bits, bool onlyPositive, bool negative) -{ - // Dense coding - // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf - const unsigned s = 180; // FIXME These should be class variables - const unsigned c = 256 - s; // FIXME Choose values! - const unsigned b = 8; // one byte - - // Calculate size first: - bits = b; - ulong x = value / s; - while (x > 0) - { - --x; - bits += b; - x = x/c; - } - - // bits is now the length of the whole codeword - ulong *B = new ulong[bits/W + 1]; - - // output codeword (backwards): - unsigned offset = bits - b; - ulong output = c + (value % s); - Tools::SetVariableField(B, b, offset, output); - - x = value / s; - while (x > 0) - { - --x; - offset -= b; - - output = x % c; - Tools::SetVariableField(B, b, offset, output); - - x = x/c; - } - - return B; -} - - -/** - * Saving the following data fields: - * ulong u, n; // Universe size, number of 1-bits in B - * ulong topCount; // Top structure and the number of substructures - * ulong bitsInP; - * ulong *P; // Pointer to BSGAP structures - * unsigned b; // Subdictionary size (\log^2 n) - * BlockArray *offsetA; // Array of pointers (into bitvector P) - * BlockArray *firstKeyA; // Array of first key positions of the substructures - */ -void BSGAP::Save(FILE *file) const -{ - if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Save(): file write error (u)."); - - if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Save(): file write error (n)."); - - if (n == 0) - return; // Done. - - if (std::fwrite(&(this->topCount), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Save(): file write error (topCount)."); - - if (std::fwrite(&(this->bitsInP), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Save(): file write error (bitsInP)."); - - for (ulong offset = 0; offset < bitsInP/W+1; offset ++) - { - if (std::fwrite((this->P)+offset, sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Save(): file write error (P)."); - } - - if (std::fwrite(&(this->b), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("BSGAP::Save(): file write error (b)."); - - offsetA->Save(file); - firstKeyA->Save(file); -} - -/** - * Load from file - */ -BSGAP::BSGAP(FILE *file) - : u(0), n(0), topCount(0), bitsInP(0), P(0), b(0), offsetA(0), firstKeyA(0) -{ - if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Load(): file read error (u)."); - - if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Load(): file read error (n)."); - - if (n == 0) - return; // Done. - - if (std::fread(&(this->topCount), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Load(): file read error (topCount)."); - - if (std::fread(&(this->bitsInP), sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Load(): file read error (bitsInP)."); - - P = new ulong[bitsInP/W+1]; - for (ulong offset = 0; offset < bitsInP/W+1; offset ++) - { - if (std::fread(this->P+offset, sizeof(ulong), 1, file) != 1) - throw std::runtime_error("BSGAP::Load(): file read error (P)."); - } - - if (std::fread(&(this->b), sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("BSGAP::Load(): file read error (b)."); - - offsetA = new BlockArray(file); - firstKeyA = new BlockArray(file); -} - -- 2.17.1