2 * Binary-searchable gap encoding scheme (BSGAP)
5 * Compile: g++ -Wall -o testBSGAP BSGAP.cpp Tools.cpp
9 * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures:
10 * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007).
19 const uchar BSGAP::bit_table[] = {
20 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,
21 1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,
22 2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,
23 1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,
24 3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,
25 1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,
26 2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1};
29 * sampleRate == number of keys in each gap-encoded binary search tree;
30 * the default value is log^2 n, where n is the number of keys.
// Constructor: builds the structure from the bitvector B over a universe of
// u_ positions.
// Steps shown here: scan B for set bits (keys), derive the block size b from
// floor(log2 n), encode the keys block by block with GetSubtree(), then
// concatenate the encoded blocks into the single bitvector P. For each block
// the first key is stored in firstKeyA and the start offset within P in
// offsetA.
// NOTE(review): how freeB and sampleRate are used is not established by the
// statements documented here -- confirm before relying on them.
32 BSGAP::BSGAP(ulong *B, ulong u_, bool freeB, ulong sampleRate)
33 : u(u_), n(0), topCount(0), bitsInP(0), P(0), b(0), offsetA(0), firstKeyA(0)
// Scan every position of B; the check on n below suggests that each set bit
// is counted into n.
36 for (ulong i = 0; i < u; i ++)
37 if (Tools::GetField(B, 1, i))
40 if (n == 0) // Sanity check
48 unsigned log2n = Tools::FloorLog2(this->n);
// Block size: floor(log2 n) squared keys per substructure.
// NOTE(review): if FloorLog2(1) is 0 then b becomes 0 for n == 1, and the
// n / b expressions below would divide by zero unless guarded elsewhere --
// confirm.
52 this->b = log2n * log2n;
56 ulong firstKey = 0, lastKey;
// Loop while the bit at firstKey is clear, i.e. until the first key is found.
58 while (!Tools::GetField(B, 1, firstKey))
61 // Temp array of BSGAP structures
// n / b + 1 slots: one per full block plus one for a possible partial block.
62 ulong **tempP = new ulong * [n / b + 1];
63 ulong *bitsInBSD = new ulong [n / b + 1];
// Presumably (number of entries, bits per entry); CeilLog2(u) bits are
// enough to hold any key position below u.
65 this->firstKeyA = new BlockArray(n/b + 1, Tools::CeilLog2(this->u));
// One iteration per full block of b keys.
67 for (ulong i = 0; i < n/b; i ++)
// Move lastKey forward until k reaches b + 1 or the universe ends; k is
// presumably advanced for each set bit encountered.
71 for (ulong k = 0; k < b + 1 && lastKey < u; lastKey ++)
72 if (Tools::GetField(B, 1, lastKey))
78 cout << "error: lastKey = " << lastKey << ", u = " << u << endl;
82 // lastKey of the last substructure is this->u
// True only for the final full block when there is no partial block after it.
83 if (i == n/b - 1 && n%b == 0)
// Encode block i; bitsInBSD[i] receives the length of the encoding in bits.
86 tempP[i] = GetSubtree(B, firstKey, firstKey, lastKey, b, true, bitsInBSD[i]);
87 totalBits += bitsInBSD[i];
88 (*firstKeyA)[i] = firstKey;
// Partial block: the n - (n/b)*b keys left over after the full blocks.
93 if (n - (n/b) * b != 0)
96 while (!Tools::GetField(B, 1, firstKey))
// The partial block is bounded on the right by the universe size u.
99 tempP[n/b] = GetSubtree(B, firstKey, firstKey, u, n - (n/b) * b, true, bitsInBSD[n/b]);
100 totalBits += bitsInBSD[n/b];
101 (*firstKeyA)[n/b] = firstKey;
// Number of substructures: n / b rounded up.
105 this->topCount = n%b ? n/b + 1: n/b;
107 // Catenate binary trees into one bitvector
// totalBits / W + 1 words are allocated for the totalBits bits.
108 this->P = new ulong[totalBits/W + 1];
109 this->bitsInP = totalBits;
111 for (ulong i = 0; i < this->topCount ; ++i)
// Append the bitsInBSD[i] bits of block i at the current bit offset of P.
113 BitCopy(P, tempP[i], offset, bitsInBSD[i]);
115 offset += bitsInBSD[i];
119 // Pointers to bit vector P (binary tree starting offsets)
120 // FIXME Use more succinct representation?
122 this->offsetA = new BlockArray(this->topCount, Tools::CeilLog2(totalBits));
// Second pass over the block lengths: store the running offset of each block
// (offset is presumably reset to 0 before this loop -- confirm).
123 for (ulong i = 0; i < this->topCount ; ++i)
125 (*offsetA)[i] = offset;
126 offset += bitsInBSD[i];
// rank(i): rank query on position i. rank0() computes i - rank(i) + 1, which
// suggests that this counts the keys in the inclusive range [0, i] -- confirm.
// Throws std::out_of_range for an out-of-range i.
// Method: binary search on firstKeyA for the substructure that covers i, then
// delegate to the recursive rank() on that substructure's encoding in P.
141 ulong BSGAP::rank(ulong i)
143 if (n == 0) // Trivial case
146 throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
// i lies before the very first key.
147 if ((*firstKeyA)[0] > i)
149 ulong l = 0, r = topCount - 1;
150 while (l < r) // Binary search on the array firstKeyA
// Midpoint computed as l + (r - l)/2.
152 ulong mid = l + (r - l)/2lu;
153 if ((*firstKeyA)[mid] < i)
// The three checks below distinguish: the first key of substructure 0 is
// past i; i is exactly the first key of substructure l; the first key of
// substructure l is past i.
158 if ((*firstKeyA)[l] > i && l == 0)
160 if ((*firstKeyA)[l] == i)
162 if ((*firstKeyA)[l] > i)
// Boundaries of substructure l: it starts at its own first key.
165 ulong lastKey, firstKey = (*firstKeyA)[l];
// The last substructure has no successor to take its right boundary from.
166 if (l == topCount - 1)
// Otherwise the right boundary is the first key of the next substructure.
169 lastKey = (*firstKeyA)[l + 1];
// Only the last substructure can hold fewer than b keys (when n % b != 0).
171 if (l == topCount - 1 && n%b)
// Rank of i within substructure l.
174 ulong k = rank(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
// NOTE(review): the comment below refers to j, but the index variable in
// this function is l.
175 // Add rank of substructures P[0], P[1], ..., P[j - 2]
// rank0(i): returns i + 1 - rank(i), i.e. the positions in [0, i] that are
// not counted by rank(i).
179 ulong BSGAP::rank0(ulong i)
181 return i - rank(i) + 1;
// IsBitSet(i): membership query for position i.
// Follows the same outline as rank(i): binary search on firstKeyA for the
// substructure covering i, then delegate to the recursive IsBitSet() on that
// substructure's encoding in P.
184 bool BSGAP::IsBitSet(ulong i)
186 if (n == 0) // Trivial case
// Returns false instead of throwing here.
// NOTE(review): the FIXME text names BSGAP::rank() although this is
// IsBitSet().
189 return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
// i lies before the very first key.
190 if ((*firstKeyA)[0] > i)
192 ulong l = 0, r = topCount - 1;
193 while (l < r) // Binary search on the array firstKeyA
195 ulong mid = l + (r - l)/2lu;
196 if ((*firstKeyA)[mid] < i)
// Post-search checks: first key of substructure 0 past i; i exactly a first
// key; first key of substructure l past i.
201 if ((*firstKeyA)[l] > i && l == 0)
203 if ((*firstKeyA)[l] == i)
205 if ((*firstKeyA)[l] > i)
// Boundaries of substructure l.
208 ulong lastKey, firstKey = (*firstKeyA)[l];
209 if (l == topCount - 1)
// Right boundary: first key of the next substructure.
212 lastKey = (*firstKeyA)[l + 1];
// Only the last substructure can hold fewer than b keys.
214 if (l == topCount - 1 && n%b)
217 return IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
221 * Also returns rank_1(i) via the second parameter.
// IsBitSet(i, resultRank): membership query for position i that also writes
// a rank value for i through resultRank.
// Same outline as IsBitSet(i): binary search on firstKeyA, then delegate to
// the recursive overload that takes resultRank.
223 bool BSGAP::IsBitSet(ulong i, ulong *resultRank)
226 if (n == 0) // Trivial case
// NOTE(review): the FIXME text names BSGAP::rank() although this is
// IsBitSet().
231 return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
// i lies before the very first key.
233 if ((*firstKeyA)[0] > i)
235 ulong l = 0, r = topCount - 1;
236 while (l < r) // Binary search on the array firstKeyA
238 ulong mid = l + (r - l)/2lu;
239 if ((*firstKeyA)[mid] < i)
244 if ((*firstKeyA)[l] > i && l == 0)
246 if ((*firstKeyA)[l] == i)
// i is the first key of substructure l: the l preceding substructures hold
// b keys each, so the rank is l * b + 1.
248 *resultRank = 1 + l * b;
251 if ((*firstKeyA)[l] > i)
// Boundaries of substructure l.
254 ulong lastKey, firstKey = (*firstKeyA)[l];
255 if (l == topCount - 1)
258 lastKey = (*firstKeyA)[l + 1];
// Only the last substructure can hold fewer than b keys.
260 if (l == topCount - 1 && n%b)
// The recursive call writes the rank within substructure l to *resultRank.
263 bool result = IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true, resultRank);
// Add the l * b keys of the preceding substructures.
264 *resultRank += l * b;
// select(i): select query for the i-th key; throws std::out_of_range for an
// out-of-range i.
// Substructure j is searched for its (i - j * b)-th key, since the j
// preceding substructures account for b keys each.
269 ulong BSGAP::select(ulong i)
276 throw std::out_of_range("BSGAP::select(): Parameter i was out of range");
// Boundaries of substructure j.
281 ulong lastKey, firstKey = (*firstKeyA)[j];
282 if (j == topCount - 1)
// Right boundary: first key of the next substructure.
285 lastKey = (*firstKeyA)[j + 1];
// Only the last substructure can hold fewer than b keys.
287 if (j == topCount - 1 && n%b)
290 return select(P, (*offsetA)[j], firstKey, i - j * b, firstKey, lastKey, nSub, true);
// select0(i): select query over the zero bits; throws std::out_of_range for
// an out-of-range i.
// Binary search on firstKeyA using the number of zeros that precede each
// substructure's first key, then delegate to the recursive select0().
293 ulong BSGAP::select0(ulong i)
295 if (n == 0) // Trivial case
298 throw std::out_of_range("BSGAP::select0(): Parameter i was out of range");
300 ulong l = 0, r = topCount - 1;
301 while (l < r) // Binary search on the array firstKeyA
303 ulong mid = l + (r - l)/2lu;
// mid * b keys precede the first key of substructure mid, so its position
// minus mid * b is the number of zeros before it.
304 if ((*firstKeyA)[mid] - mid*b < i) // Number of zeros before [mid]
// At least i positions precede the very first key.
309 if (l == 0 && (*firstKeyA)[l] >= i)
// At least i zeros precede the first key of substructure l.
311 if (l > 0 && (*firstKeyA)[l] - l*b >= i)
// Discount the zeros located before substructure l.
314 i -= (*firstKeyA)[l] - l*b;
// Boundaries of substructure l.
316 ulong lastKey, firstKey = (*firstKeyA)[l];
317 if (l == topCount - 1)
320 lastKey = (*firstKeyA)[l + 1];
// Only the last substructure can hold fewer than b keys.
322 if (l == topCount - 1 && n%b)
325 return select0(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
// Recursive select0 over one encoded subtree.
//   B, offset  - encoded bitvector and the bit position of this subtree
//   firstKey   - first key of the enclosing substructure
//   i          - which zero is wanted, relative to this subtree
//   l, r       - boundaries of the subtree's key range
//   n          - number of keys in the subtree
//   leftChild  - presumably whether this subtree is a left child -- confirm
// Throws runtime_error if the subtree is full.
328 ulong BSGAP::select0(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
335 // Check if subtree is full --- FIXME Make this test an assert()!
336 if (IsSubtreeFull(firstKey, l, r, n))
337 throw runtime_error("BSGAP::select0(): subtree was full"); // Should not happen // (l == firstKey ? l + i - 1: l + i);
// Decode the value stored for this node; offset is passed by reference to
// DeltaDecode.
341 ulong x = DeltaDecode(B, offset);
// Fewer than i zeros on the left side of key: the answer is to the right.
347 if (numberOfZeros(firstKey, l, key, n) < i)
349 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
// Discount the zeros that were skipped on the left side.
350 i = i - numberOfZeros(firstKey, l, key, n);
// Otherwise continue in the left subtree.
357 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
// Recursive rank over one encoded subtree.
//   B, offset  - encoded bitvector and the bit position of this subtree
//   firstKey   - first key of the enclosing substructure
//   i          - queried position
//   l, r       - boundaries of the subtree's key range
//   n          - number of keys in the subtree
//   leftChild  - presumably whether this subtree is a left child -- confirm
366 ulong BSGAP::rank(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
368 // Number of keys in subtrees on left-side
377 // Check if subtree is full
378 if (IsSubtreeFull(firstKey, l, r, n))
// Full subtree: the rank follows from the distance to l; the boundary l is
// counted only when it is the substructure's first key.
379 return result + (l == firstKey ? i - l + 1 : i - l);
383 ulong x = DeltaDecode(B, offset);
// GetSubtree() places n/2 keys in the left subtree, so the node's own key
// has rank n/2 + 1 within this subtree (presumably the case handled here).
389 return result + n/2 + 1;
393 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
401 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
// Recursive membership test over one encoded subtree; the parameters have
// the same roles as in the recursive rank().
410 bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
412 // Number of keys in subtrees on left-side
421 // Check if subtree is full
422 if (IsSubtreeFull(firstKey, l, r, n))
// NOTE(review): this bool function returns the same arithmetic expression
// as rank(); the value is converted to bool, so any non-zero result reads as
// true. Confirm that this is intended rather than a plain `true`.
423 return result + (l == firstKey ? i - l + 1 : i - l);
427 ulong x = DeltaDecode(B, offset);
437 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
445 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
// Recursive membership test that also reports a rank through resultRank;
// the other parameters have the same roles as in the recursive rank().
453 // Returns also the rank of i
454 bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild, ulong *resultRank)
456 // Number of keys in subtrees on left-side
464 *resultRank = result;
468 // Check if subtree is full
469 if (IsSubtreeFull(firstKey, l, r, n))
// Full subtree: same closed-form rank as in the recursive rank().
471 *resultRank = result + (l == firstKey ? i - l + 1 : i - l);
477 ulong x = DeltaDecode(B, offset);
// GetSubtree() places n/2 keys in the left subtree, so the node's own key
// has rank n/2 + 1 within this subtree (presumably the case handled here).
484 *resultRank = result + n/2 + 1;
490 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
498 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
// Recursive select over one encoded subtree; i is the wanted key's rank
// relative to this subtree, and the other parameters have the same roles as
// in the recursive rank().
507 ulong BSGAP::select(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
514 // Check if subtree is full
515 if (IsSubtreeFull(firstKey, l, r, n))
// Full subtree: the i-th key is at a fixed distance from l; the boundary l
// is counted as a key only when it is the substructure's first key.
516 return (l == firstKey ? l + i - 1: l + i);
520 ulong x = DeltaDecode(B, offset);
529 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
537 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
// Recursively encodes the n keys of B that belong to the range bounded by l
// and r, and returns a newly allocated bit buffer; `bits` receives its
// length in bits.
// Layout of the returned buffer, as written by the BitCopy calls below:
//   [key delta][jump offset, only if both subtrees are non-empty][left][right]
545 ulong * BSGAP::GetSubtree(ulong *B, ulong firstKey, ulong l, ulong r, ulong n, bool leftChild, ulong &bits)
551 // Check if subtree is full
552 if (IsSubtreeFull(firstKey, l, r, n))
// The median key becomes the root of this subtree.
556 ulong key = FindMedian(B, firstKey, l, r, n);
558 ulong *leftSub, *rightSub, leftBits, rightBits;
// n/2 keys go to the left of the median, and n - n/2 - 1 to the right.
559 leftSub = GetSubtree(B, firstKey, l, key, n/2, true, leftBits);
560 rightSub = GetSubtree(B, firstKey, key, r, n - n/2 - 1, false, rightBits);
563 ulong *keyDelta, keyBits;
// The key is stored as a distance to one of the two boundaries: r - key or
// key - l (the choice presumably depends on leftChild -- confirm).
565 keyDelta = DeltaEncode(r - key, keyBits, true);
567 keyDelta = DeltaEncode(key - l, keyBits, true);
569 // Encode jump offset if left and right subtrees exists
570 ulong *jumpOffset = 0, jumpBits = 0;
571 if (leftBits != 0 && rightBits != 0)
// The jump offset is the encoded length of the left subtree in bits.
572 jumpOffset = DeltaEncode(leftBits, jumpBits);
574 // bits is the sum of keyBits, jumpBits, leftBits and rightBits
575 bits = keyBits + jumpBits + leftBits + rightBits;
576 ulong *output = new ulong[bits / W + 1];
581 BitCopy(output, keyDelta, 0, keyBits);
583 ulong offset = keyBits;
587 BitCopy(output, jumpOffset, offset, jumpBits);
589 delete [] jumpOffset;
592 BitCopy(output, leftSub, offset, leftBits);
594 BitCopy(output, rightSub, offset, rightBits);
// Every bit accounted for in `bits` must have been written.
597 assert(offset == bits);
// Copies len bits from src into dest, starting at bit position offset of
// dest.
605 void BSGAP::BitCopy(ulong *dest, ulong *src, ulong offset, ulong len)
// Single-field case: all len bits are taken from the first word of src
// (presumably used when len fits in one word -- confirm).
613 Tools::SetVariableField(dest, len, offset, *src);
// General case: copy whole W-bit words first ...
617 for (i = 0; i < len/W; i ++)
618 Tools::SetVariableField(dest, W, offset + i * W, src[i]);
// ... then the remaining len - i*W bits from the next source word.
621 Tools::SetVariableField(dest, len - i*W, offset + i * W, src[i]);
// Finds the median key of a subtree by scanning B bit by bit, starting just
// after the left boundary l. GetSubtree() uses the result as the subtree's
// root.
624 ulong BSGAP::FindMedian(ulong *B, ulong firstKey, ulong l, ulong r, ulong n)
626 // Linear scan: slow but affects only the construction time
629 l ++; // Skip left boundary
// A set bit at position l is a key.
633 if (Tools::GetField(B, 1, l))
// Decodes one codeword from B starting at bit position offset. offset is
// taken by reference (presumably advanced past the codeword -- confirm).
// The code is byte oriented and parameterised by s and c, following the
// paper referenced below: bytes with a value below c continue the codeword,
// and a byte with a value of at least c ends it.
640 ulong BSGAP::DeltaDecode(ulong *B, ulong &offset)
643 // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
644 const unsigned s = 180; // FIXME These should be class variables
645 const unsigned c = 256 - s; // FIXME Choose <s,c> values!
// Note: this local b (bits per byte) shadows the member b used elsewhere in
// the class as the block size.
646 const unsigned b = 8; // one byte.
// Continuation bytes (value < c) are accumulated in base c.
651 while (Tools::GetVariableField(B, b, offset) < c)
653 i = i*c + Tools::GetVariableField(B, b, offset);
// The terminating byte (value >= c) contributes a digit in base s.
658 i = i * s + (Tools::GetVariableField(B, b, offset) - c);
// Encodes value with the byte-oriented <s,c> code that DeltaDecode() reads,
// and returns a newly allocated buffer; `bits` receives the codeword length
// in bits.
// NOTE(review): the roles of onlyPositive and negative are not established
// by the statements documented here -- confirm.
664 ulong * BSGAP::DeltaEncode(ulong value, ulong &bits, bool onlyPositive, bool negative)
667 // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
668 const unsigned s = 180; // FIXME These should be class variables
669 const unsigned c = 256 - s; // FIXME Choose <s,c> values!
// Note: this local b (bits per byte) shadows the member b used elsewhere in
// the class as the block size.
670 const unsigned b = 8; // one byte
672 // Calculate size first:
682 // bits is now the length of the whole codeword
683 ulong *B = new ulong[bits/W + 1];
685 // output codeword (backwards):
// Start with the last byte of the codeword.
686 unsigned offset = bits - b;
// The terminating byte is c + (value mod s), i.e. always at least c.
687 ulong output = c + (value % s);
688 Tools::SetVariableField(B, b, offset, output);
697 Tools::SetVariableField(B, b, offset, output);
707 * Saving the following data fields:
708 * ulong u, n; // Universe size, number of 1-bits in B
709 * ulong topCount; // Top structure and the number of substructures
711 * ulong *P; // Pointer to BSGAP structures
712 * unsigned b; // Subdictionary size (\log^2 n)
713 * BlockArray *offsetA; // Array of pointers (into bitvector P)
714 * BlockArray *firstKeyA; // Array of first key positions of the substructures
// Writes the structure to file in native binary form. Fields are written in
// this order: u, n, topCount, bitsInP, the bitsInP/W + 1 words of P, b, and
// then the BlockArray members.
// Throws std::runtime_error if an fwrite call does not write its item.
// NOTE(review): the loading constructor reads offsetA before firstKeyA, so
// offsetA is presumably saved just before firstKeyA->Save() -- confirm.
716 void BSGAP::Save(FILE *file) const
718 if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1)
719 throw std::runtime_error("BSGAP::Save(): file write error (u).");
721 if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
722 throw std::runtime_error("BSGAP::Save(): file write error (n).");
727 if (std::fwrite(&(this->topCount), sizeof(ulong), 1, file) != 1)
728 throw std::runtime_error("BSGAP::Save(): file write error (topCount).");
730 if (std::fwrite(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
731 throw std::runtime_error("BSGAP::Save(): file write error (bitsInP).");
// P is written one word per fwrite call.
733 for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
735 if (std::fwrite(this->P+offset, sizeof(ulong), 1, file) != 1)
736 throw std::runtime_error("BSGAP::Save(): file write error (P).");
739 if (std::fwrite(&(this->b), sizeof(unsigned), 1, file) != 1)
740 throw std::runtime_error("BSGAP::Save(): file write error (b).");
743 firstKeyA->Save(file);
// Loading constructor: reads the fields from file in the order u, n,
// topCount, bitsInP, the bitsInP/W + 1 words of P, b, offsetA, firstKeyA.
// Throws std::runtime_error if an fread call does not read its item.
// NOTE(review): P is a raw new[] allocation; if a later read throws, the
// destructor does not run for the partially constructed object, so P is
// leaked.
749 BSGAP::BSGAP(FILE *file)
750 : u(0), n(0), topCount(0), bitsInP(0), P(0), b(0), offsetA(0), firstKeyA(0)
752 if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1)
753 throw std::runtime_error("BSGAP::Load(): file read error (u).");
755 if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
756 throw std::runtime_error("BSGAP::Load(): file read error (n).");
761 if (std::fread(&(this->topCount), sizeof(ulong), 1, file) != 1)
762 throw std::runtime_error("BSGAP::Load(): file read error (topCount).");
764 if (std::fread(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
765 throw std::runtime_error("BSGAP::Load(): file read error (bitsInP).");
// The allocation size is derived from the bitsInP value just read from the
// file.
767 P = new ulong[bitsInP/W+1];
// P is read one word per fread call.
768 for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
770 if (std::fread(this->P+offset, sizeof(ulong), 1, file) != 1)
771 throw std::runtime_error("BSGAP::Load(): file read error (P).");
774 if (std::fread(&(this->b), sizeof(unsigned), 1, file) != 1)
775 throw std::runtime_error("BSGAP::Load(): file read error (b).");
// The BlockArray members load themselves from the same stream.
777 offsetA = new BlockArray(file);
778 firstKeyA = new BlockArray(file);