2 * Binary-searchable gap encoding scheme (BSGAP)
5 * Compile: g++ -Wall -o testBSGAP BSGAP.cpp Tools.cpp
9 * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures:
10 * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007).
19 const uchar BSGAP::bit_table[] = {
20 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,
21 1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,
22 2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,
23 1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,
24 3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,
25 1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,
26 2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1};
29 * samplerate == number of keys in each gap encoded binary search tree
30 * default value is log^2 n where n is the number of keys.
32 BSGAP::BSGAP(ulong *B, ulong u, bool freeB, ulong sampleRate)
42 for (ulong i = 0; i < u; i ++)
43 if (Tools::GetField(B, 1, i))
46 if (n == 0) // Sanity check
54 unsigned log2n = Tools::FloorLog2(this->n);
58 this->b = log2n * log2n;
62 ulong firstKey = 0, lastKey;
64 while (!Tools::GetField(B, 1, firstKey))
67 // Temp array of BSGAP structures
68 ulong **tempP = new ulong * [n / b + 1];
69 ulong *bitsInBSD = new ulong [n / b + 1];
71 this->firstKeyA = new BlockArray(n/b + 1, Tools::CeilLog2(this->u));
73 for (ulong i = 0; i < n/b; i ++)
77 for (ulong k = 0; k < b + 1 && lastKey < u; lastKey ++)
78 if (Tools::GetField(B, 1, lastKey))
84 cout << "error: lastKey = " << lastKey << ", u = " << u << endl;
88 // lastKey of the last substructure is this->u
89 if (i == n/b - 1 && n%b == 0)
92 tempP[i] = GetSubtree(B, firstKey, firstKey, lastKey, b, true, bitsInBSD[i]); // FIXME: left of right subtree?
93 totalBits += bitsInBSD[i];
94 (*firstKeyA)[i] = firstKey;
99 if (n - (n/b) * b != 0)
101 // Find the first key
102 while (!Tools::GetField(B, 1, firstKey))
105 tempP[n/b] = GetSubtree(B, firstKey, firstKey, u, n - (n/b) * b, true, bitsInBSD[n/b]); // FIXME: left of right subtree?
106 totalBits += bitsInBSD[n/b];
107 (*firstKeyA)[n/b] = firstKey;
111 this->topCount = n%b ? n/b + 1: n/b;
113 // Catenate binary trees into one bitvector
114 this->P = new ulong[totalBits/W + 1];
115 this->bitsInP = totalBits;
117 for (ulong i = 0; i < this->topCount ; ++i)
119 BitCopy(P, tempP[i], offset, bitsInBSD[i]);
121 offset += bitsInBSD[i];
125 // Pointers to bit vector P (binary tree starting offsets)
126 // FIXME Use more succinct representation?
128 this->offsetA = new BlockArray(this->topCount, Tools::CeilLog2(totalBits));
129 for (ulong i = 0; i < this->topCount ; ++i)
131 (*offsetA)[i] = offset;
132 offset += bitsInBSD[i];
147 ulong BSGAP::rank(ulong i)
149 if (n == 0) // Trivial case
152 throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
153 if ((*firstKeyA)[0] > i)
155 ulong l = 0, r = topCount - 1;
156 while (l < r) // Binary search on the array firstKeyA
158 ulong mid = l + (r - l)/2lu;
159 if ((*firstKeyA)[mid] < i)
164 if ((*firstKeyA)[l] > i && l == 0)
166 if ((*firstKeyA)[l] == i)
168 if ((*firstKeyA)[l] > i)
171 ulong lastKey, firstKey = (*firstKeyA)[l];
172 if (l == topCount - 1)
175 lastKey = (*firstKeyA)[l + 1];
177 if (l == topCount - 1 && n%b)
180 ulong k = rank(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
181 // Add rank of substructures P[0], P[1], ..., P[j - 2]
185 ulong BSGAP::rank0(ulong i)
187 return i - rank(i) + 1;
190 bool BSGAP::IsBitSet(ulong i)
192 if (n == 0) // Trivial case
195 return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
196 if ((*firstKeyA)[0] > i)
198 ulong l = 0, r = topCount - 1;
199 while (l < r) // Binary search on the array firstKeyA
201 ulong mid = l + (r - l)/2lu;
202 if ((*firstKeyA)[mid] < i)
207 if ((*firstKeyA)[l] > i && l == 0)
209 if ((*firstKeyA)[l] == i)
211 if ((*firstKeyA)[l] > i)
214 ulong lastKey, firstKey = (*firstKeyA)[l];
215 if (l == topCount - 1)
218 lastKey = (*firstKeyA)[l + 1];
220 if (l == topCount - 1 && n%b)
223 return IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
227 * Returns also rank_1(i) via the second parameter.
229 bool BSGAP::IsBitSet(ulong i, ulong *resultRank)
232 if (n == 0) // Trivial case
237 return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
239 if ((*firstKeyA)[0] > i)
241 ulong l = 0, r = topCount - 1;
242 while (l < r) // Binary search on the array firstKeyA
244 ulong mid = l + (r - l)/2lu;
245 if ((*firstKeyA)[mid] < i)
250 if ((*firstKeyA)[l] > i && l == 0)
252 if ((*firstKeyA)[l] == i)
254 *resultRank = 1 + l * b;
257 if ((*firstKeyA)[l] > i)
260 ulong lastKey, firstKey = (*firstKeyA)[l];
261 if (l == topCount - 1)
264 lastKey = (*firstKeyA)[l + 1];
266 if (l == topCount - 1 && n%b)
269 bool result = IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true, resultRank);
270 *resultRank += l * b;
275 ulong BSGAP::select(ulong i)
282 throw std::out_of_range("BSGAP::select(): Parameter i was out of range");
287 ulong lastKey, firstKey = (*firstKeyA)[j];
288 if (j == topCount - 1)
291 lastKey = (*firstKeyA)[j + 1];
293 if (j == topCount - 1 && n%b)
296 return select(P, (*offsetA)[j], firstKey, i - j * b, firstKey, lastKey, nSub, true);
299 ulong BSGAP::select0(ulong i)
301 if (n == 0) // Trivial case
304 throw std::out_of_range("BSGAP::select0(): Parameter i was out of range");
306 ulong l = 0, r = topCount - 1;
307 while (l < r) // Binary search on the array firstKeyA
309 ulong mid = l + (r - l)/2lu;
310 if ((*firstKeyA)[mid] - mid*b < i) // Number of zeros before [mid]
315 if (l == 0 && (*firstKeyA)[l] >= i)
317 if (l > 0 && (*firstKeyA)[l] - l*b >= i)
320 i -= (*firstKeyA)[l] - l*b;
322 ulong lastKey, firstKey = (*firstKeyA)[l];
323 if (l == topCount - 1)
326 lastKey = (*firstKeyA)[l + 1];
328 if (l == topCount - 1 && n%b)
331 return select0(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
334 ulong BSGAP::select0(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
341 // Check if subtree is full --- FIXME Make this test an assert()!
342 if (IsSubtreeFull(firstKey, l, r, n))
343 throw runtime_error("BSGAP::select0(): subtree was full"); // Should not happen // (l == firstKey ? l + i - 1: l + i);
347 ulong x = DeltaDecode(B, offset);
353 if (numberOfZeros(firstKey, l, key, n) < i)
355 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
356 i = i - numberOfZeros(firstKey, l, key, n);
363 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
372 ulong BSGAP::rank(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
374 // Number of keys in subtrees on left-side
383 // Check if subtree is full
384 if (IsSubtreeFull(firstKey, l, r, n))
385 return result + (l == firstKey ? i - l + 1 : i - l);
389 ulong x = DeltaDecode(B, offset);
395 return result + n/2 + 1;
399 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
407 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
416 bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
418 // Number of keys in subtrees on left-side
427 // Check if subtree is full
428 if (IsSubtreeFull(firstKey, l, r, n))
429 return result + (l == firstKey ? i - l + 1 : i - l);
433 ulong x = DeltaDecode(B, offset);
443 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
451 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
459 // Returns also the rank of i
460 bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild, ulong *resultRank)
462 // Number of keys in subtrees on left-side
470 *resultRank = result;
474 // Check if subtree is full
475 if (IsSubtreeFull(firstKey, l, r, n))
477 *resultRank = result + (l == firstKey ? i - l + 1 : i - l);
483 ulong x = DeltaDecode(B, offset);
490 *resultRank = result + n/2 + 1;
496 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
504 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
513 ulong BSGAP::select(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
520 // Check if subtree is full
521 if (IsSubtreeFull(firstKey, l, r, n))
522 return (l == firstKey ? l + i - 1: l + i);
526 ulong x = DeltaDecode(B, offset);
535 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
543 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
551 ulong * BSGAP::GetSubtree(ulong *B, ulong firstKey, ulong l, ulong r, ulong n, bool leftChild, ulong &bits)
557 // Check if subtree is full
558 if (IsSubtreeFull(firstKey, l, r, n))
562 ulong key = FindMedian(B, firstKey, l, r, n);
564 ulong *leftSub, *rightSub, leftBits, rightBits;
565 leftSub = GetSubtree(B, firstKey, l, key, n/2, true, leftBits);
566 rightSub = GetSubtree(B, firstKey, key, r, n - n/2 - 1, false, rightBits);
569 ulong *keyDelta, keyBits;
571 keyDelta = DeltaEncode(r - key, keyBits, true);
573 keyDelta = DeltaEncode(key - l, keyBits, true);
575 // Encode jump offset if left and right subtrees exists
576 ulong *jumpOffset = 0, jumpBits = 0;
577 if (leftBits != 0 && rightBits != 0)
578 jumpOffset = DeltaEncode(leftBits, jumpBits);
580 // bits is the sum of keyBits, jumpBits, leftBits and rightBits
581 bits = keyBits + jumpBits + leftBits + rightBits;
582 ulong *output = new ulong[bits / W + 1];
587 BitCopy(output, keyDelta, 0, keyBits);
589 ulong offset = keyBits;
593 BitCopy(output, jumpOffset, offset, jumpBits);
595 delete [] jumpOffset;
598 BitCopy(output, leftSub, offset, leftBits);
600 BitCopy(output, rightSub, offset, rightBits);
603 assert(offset == bits);
611 void BSGAP::BitCopy(ulong *dest, ulong *src, ulong offset, ulong len)
619 Tools::SetVariableField(dest, len, offset, *src);
623 for (i = 0; i < len/W; i ++)
624 Tools::SetVariableField(dest, W, offset + i * W, src[i]);
627 Tools::SetVariableField(dest, len - i*W, offset + i * W, src[i]);
630 ulong BSGAP::FindMedian(ulong *B, ulong firstKey, ulong l, ulong r, ulong n)
632 // Linear scan: slow but affects only the construction time
635 l ++; // Skip left boundary
639 if (Tools::GetField(B, 1, l))
646 ulong BSGAP::DeltaDecode(ulong *B, ulong &offset)
649 // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
650 const unsigned s = 180; // FIXME These should be class variables
651 const unsigned c = 256 - s; // FIXME Choose <s,c> values!
652 const unsigned b = 8; // one byte.
657 while (Tools::GetVariableField(B, b, offset) < c)
659 i = i*c + Tools::GetVariableField(B, b, offset);
664 i = i * s + (Tools::GetVariableField(B, b, offset) - c);
670 ulong * BSGAP::DeltaEncode(ulong value, ulong &bits, bool onlyPositive, bool negative)
673 // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
674 const unsigned s = 180; // FIXME These should be class variables
675 const unsigned c = 256 - s; // FIXME Choose <s,c> values!
676 const unsigned b = 8; // one byte
678 // Calculate size first:
688 // bits is now the length of the whole codeword
689 ulong *B = new ulong[bits/W + 1];
691 // output codeword (backwards):
692 unsigned offset = bits - b;
693 ulong output = c + (value % s);
694 Tools::SetVariableField(B, b, offset, output);
703 Tools::SetVariableField(B, b, offset, output);
713 * Saving the following data fields:
714 * ulong u, n; // Universe size, number of 1-bits in B
715 * ulong topCount; // Top structure and the number of substructures
717 * ulong *P; // Pointer to BSGAP structures
718 * unsigned b; // Subdictionary size (\log^2 n)
719 * BlockArray *offsetA; // Array of pointers (into bitvector P)
720 * BlockArray *firstKeyA; // Array of first key positions of the substructures
722 void BSGAP::Save(FILE *file) const
724 if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1)
725 throw std::runtime_error("BSGAP::Save(): file write error (u).");
727 if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
728 throw std::runtime_error("BSGAP::Save(): file write error (n).");
730 if (std::fwrite(&(this->topCount), sizeof(ulong), 1, file) != 1)
731 throw std::runtime_error("BSGAP::Save(): file write error (topCount).");
733 if (std::fwrite(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
734 throw std::runtime_error("BSGAP::Save(): file write error (bitsInP).");
736 for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
738 if (std::fwrite(this->P+offset, sizeof(ulong), 1, file) != 1)
739 throw std::runtime_error("BSGAP::Save(): file write error (P).");
742 if (std::fwrite(&(this->b), sizeof(unsigned), 1, file) != 1)
743 throw std::runtime_error("BSGAP::Save(): file write error (b).");
746 firstKeyA->Save(file);
752 BSGAP::BSGAP(FILE *file)
754 if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1)
755 throw std::runtime_error("BSGAP::Load(): file read error (u).");
757 if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
758 throw std::runtime_error("BSGAP::Load(): file read error (n).");
760 if (std::fread(&(this->topCount), sizeof(ulong), 1, file) != 1)
761 throw std::runtime_error("BSGAP::Load(): file read error (topCount).");
763 if (std::fread(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
764 throw std::runtime_error("BSGAP::Load(): file read error (bitsInP).");
766 P = new ulong[bitsInP/W+1];
767 for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
769 if (std::fread(this->P+offset, sizeof(ulong), 1, file) != 1)
770 throw std::runtime_error("BSGAP::Load(): file read error (P).");
773 if (std::fread(&(this->b), sizeof(unsigned), 1, file) != 1)
774 throw std::runtime_error("BSGAP::Load(): file read error (b).");
776 offsetA = new BlockArray(file);
777 firstKeyA = new BlockArray(file);