2 * Binary-searchable gap encoding scheme (BSGAP)
5 * Compile: g++ -Wall -o testBSGAP BSGAP.cpp Tools.cpp
9 * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures:
10 * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007).
19 const uchar BSGAP::bit_table[] = {
20 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,
21 1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0,
22 2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,
23 1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,
24 3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,
25 1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,
26 2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1};
29 * samplerate == number of keys in each gap encoded binary search tree
30 * default value is log^2 n where n is the number of keys.
32 BSGAP::BSGAP(ulong *B, ulong u_, bool freeB, ulong sampleRate)
33 : u(u_), n(0), topCount(0), bitsInP(0), P(0), b(0), offsetA(0), firstKeyA(0)
36 for (ulong i = 0; i < u; i ++)
37 if (Tools::GetField(B, 1, i))
40 if (n == 0) // Sanity check
48 unsigned log2n = Tools::FloorLog2(this->n);
52 this->b = log2n * log2n;
56 ulong firstKey = 0, lastKey;
58 while (!Tools::GetField(B, 1, firstKey))
61 // Temp array of BSGAP structures
62 ulong **tempP = new ulong * [n / b + 1];
63 ulong *bitsInBSD = new ulong [n / b + 1];
66 this->topCount = n%b ? n/b + 1: n/b;
67 this->firstKeyA = new BlockArray(topCount, Tools::CeilLog2(this->u));
69 for (ulong i = 0; i < n/b; i ++)
73 for (ulong k = 0; k < b + 1 && lastKey < u; lastKey ++)
74 if (Tools::GetField(B, 1, lastKey))
80 cout << "error: lastKey = " << lastKey << ", u = " << u << endl;
84 // lastKey of the last substructure is this->u
85 if (i == n/b - 1 && n%b == 0)
88 tempP[i] = GetSubtree(B, firstKey, firstKey, lastKey, b, true, bitsInBSD[i]);
89 totalBits += bitsInBSD[i];
90 (*firstKeyA)[i] = firstKey;
95 if (n - (n/b) * b != 0)
98 while (!Tools::GetField(B, 1, firstKey))
101 tempP[n/b] = GetSubtree(B, firstKey, firstKey, u, n - (n/b) * b, true, bitsInBSD[n/b]);
102 totalBits += bitsInBSD[n/b];
103 (*firstKeyA)[n/b] = firstKey;
106 // Catenate binary trees into one bitvector
107 this->P = new ulong[totalBits/W + 1];
108 for (ulong i = 0; i < totalBits/W + 1; ++i)
110 this->bitsInP = totalBits;
112 for (ulong i = 0; i < this->topCount ; ++i)
114 BitCopy(P, tempP[i], offset, bitsInBSD[i]);
116 offset += bitsInBSD[i];
120 // Pointers to bit vector P (binary tree starting offsets)
121 // FIXME Use more succinct representation?
123 this->offsetA = new BlockArray(this->topCount, Tools::CeilLog2(totalBits));
124 for (ulong i = 0; i < this->topCount ; ++i)
126 (*offsetA)[i] = offset;
127 offset += bitsInBSD[i];
142 ulong BSGAP::rank(ulong i)
144 if (n == 0) // Trivial case
147 throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
148 if ((*firstKeyA)[0] > i)
150 ulong l = 0, r = topCount - 1;
151 while (l < r) // Binary search on the array firstKeyA
153 ulong mid = l + (r - l)/2lu;
154 if ((*firstKeyA)[mid] < i)
159 if ((*firstKeyA)[l] > i && l == 0)
161 if ((*firstKeyA)[l] == i)
163 if ((*firstKeyA)[l] > i)
166 ulong lastKey, firstKey = (*firstKeyA)[l];
167 if (l == topCount - 1)
170 lastKey = (*firstKeyA)[l + 1];
172 if (l == topCount - 1 && n%b)
175 ulong k = rank(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
176 // Add rank of substructures P[0], P[1], ..., P[j - 2]
180 ulong BSGAP::rank0(ulong i)
182 return i - rank(i) + 1;
185 bool BSGAP::IsBitSet(ulong i)
187 if (n == 0) // Trivial case
190 return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
191 if ((*firstKeyA)[0] > i)
193 ulong l = 0, r = topCount - 1;
194 while (l < r) // Binary search on the array firstKeyA
196 ulong mid = l + (r - l)/2lu;
197 if ((*firstKeyA)[mid] < i)
202 if ((*firstKeyA)[l] > i && l == 0)
204 if ((*firstKeyA)[l] == i)
206 if ((*firstKeyA)[l] > i)
209 ulong lastKey, firstKey = (*firstKeyA)[l];
210 if (l == topCount - 1)
213 lastKey = (*firstKeyA)[l + 1];
215 if (l == topCount - 1 && n%b)
218 return IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
222 * Returns also rank_1(i) via the second parameter.
224 bool BSGAP::IsBitSet(ulong i, ulong *resultRank)
227 if (n == 0) // Trivial case
232 return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range");
234 if ((*firstKeyA)[0] > i)
236 ulong l = 0, r = topCount - 1;
237 while (l < r) // Binary search on the array firstKeyA
239 ulong mid = l + (r - l)/2lu;
240 if ((*firstKeyA)[mid] < i)
245 if ((*firstKeyA)[l] > i && l == 0)
247 if ((*firstKeyA)[l] == i)
249 *resultRank = 1 + l * b;
252 if ((*firstKeyA)[l] > i)
255 ulong lastKey, firstKey = (*firstKeyA)[l];
256 if (l == topCount - 1)
259 lastKey = (*firstKeyA)[l + 1];
261 if (l == topCount - 1 && n%b)
264 bool result = IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true, resultRank);
265 *resultRank += l * b;
270 ulong BSGAP::select(ulong i)
277 throw std::out_of_range("BSGAP::select(): Parameter i was out of range");
282 ulong lastKey, firstKey = (*firstKeyA)[j];
283 if (j == topCount - 1)
286 lastKey = (*firstKeyA)[j + 1];
288 if (j == topCount - 1 && n%b)
291 return select(P, (*offsetA)[j], firstKey, i - j * b, firstKey, lastKey, nSub, true);
294 ulong BSGAP::select0(ulong i)
296 if (n == 0) // Trivial case
299 throw std::out_of_range("BSGAP::select0(): Parameter i was out of range");
301 ulong l = 0, r = topCount - 1;
302 while (l < r) // Binary search on the array firstKeyA
304 ulong mid = l + (r - l)/2lu;
305 if ((*firstKeyA)[mid] - mid*b < i) // Number of zeros before [mid]
310 if (l == 0 && (*firstKeyA)[l] >= i)
312 if (l > 0 && (*firstKeyA)[l] - l*b >= i)
315 i -= (*firstKeyA)[l] - l*b;
317 ulong lastKey, firstKey = (*firstKeyA)[l];
318 if (l == topCount - 1)
321 lastKey = (*firstKeyA)[l + 1];
323 if (l == topCount - 1 && n%b)
326 return select0(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true);
329 ulong BSGAP::select0(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
336 // Check if subtree is full --- FIXME Make this test an assert()!
337 if (IsSubtreeFull(firstKey, l, r, n))
338 throw runtime_error("BSGAP::select0(): subtree was full"); // Should not happen // (l == firstKey ? l + i - 1: l + i);
342 ulong x = DeltaDecode(B, offset);
348 if (numberOfZeros(firstKey, l, key, n) < i)
350 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
351 i = i - numberOfZeros(firstKey, l, key, n);
358 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
367 ulong BSGAP::rank(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
369 // Number of keys in subtrees on left-side
378 // Check if subtree is full
379 if (IsSubtreeFull(firstKey, l, r, n))
380 return result + (l == firstKey ? i - l + 1 : i - l);
384 ulong x = DeltaDecode(B, offset);
390 return result + n/2 + 1;
394 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
402 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
411 bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
413 // Number of keys in subtrees on left-side
422 // Check if subtree is full
423 if (IsSubtreeFull(firstKey, l, r, n))
424 return result + (l == firstKey ? i - l + 1 : i - l);
428 ulong x = DeltaDecode(B, offset);
438 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
446 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
454 // Returns also the rank of i
455 bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild, ulong *resultRank)
457 // Number of keys in subtrees on left-side
465 *resultRank = result;
469 // Check if subtree is full
470 if (IsSubtreeFull(firstKey, l, r, n))
472 *resultRank = result + (l == firstKey ? i - l + 1 : i - l);
478 ulong x = DeltaDecode(B, offset);
485 *resultRank = result + n/2 + 1;
491 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
499 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
508 ulong BSGAP::select(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild)
515 // Check if subtree is full
516 if (IsSubtreeFull(firstKey, l, r, n))
517 return (l == firstKey ? l + i - 1: l + i);
521 ulong x = DeltaDecode(B, offset);
530 offset = FindRightSubtree(B, offset, firstKey, key, l, r, n);
538 offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n);
546 ulong * BSGAP::GetSubtree(ulong *B, ulong firstKey, ulong l, ulong r, ulong n, bool leftChild, ulong &bits)
552 // Check if subtree is full
553 if (IsSubtreeFull(firstKey, l, r, n))
557 ulong key = FindMedian(B, firstKey, l, r, n);
559 ulong *leftSub, *rightSub, leftBits, rightBits;
560 leftSub = GetSubtree(B, firstKey, l, key, n/2, true, leftBits);
561 rightSub = GetSubtree(B, firstKey, key, r, n - n/2 - 1, false, rightBits);
564 ulong *keyDelta, keyBits;
566 keyDelta = DeltaEncode(r - key, keyBits, true);
568 keyDelta = DeltaEncode(key - l, keyBits, true);
570 // Encode jump offset if left and right subtrees exists
571 ulong *jumpOffset = 0, jumpBits = 0;
572 if (leftBits != 0 && rightBits != 0)
573 jumpOffset = DeltaEncode(leftBits, jumpBits);
575 // bits is the sum of keyBits, jumpBits, leftBits and rightBits
576 bits = keyBits + jumpBits + leftBits + rightBits;
577 ulong *output = new ulong[bits / W + 1];
578 for (ulong i = 0; i < bits/W+1; ++i)
584 BitCopy(output, keyDelta, 0, keyBits);
586 ulong offset = keyBits;
590 BitCopy(output, jumpOffset, offset, jumpBits);
592 delete [] jumpOffset;
595 BitCopy(output, leftSub, offset, leftBits);
597 BitCopy(output, rightSub, offset, rightBits);
600 assert(offset == bits);
608 void BSGAP::BitCopy(ulong *dest, ulong *src, ulong offset, ulong len)
616 Tools::SetVariableField(dest, len, offset, *src);
620 for (i = 0; i < len/W; i ++)
621 Tools::SetVariableField(dest, W, offset + i * W, src[i]);
624 Tools::SetVariableField(dest, len - i*W, offset + i * W, src[i]);
627 ulong BSGAP::FindMedian(ulong *B, ulong firstKey, ulong l, ulong r, ulong n)
629 // Linear scan: slow but affects only the construction time
632 l ++; // Skip left boundary
636 if (Tools::GetField(B, 1, l))
643 ulong BSGAP::DeltaDecode(ulong *B, ulong &offset)
646 // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
647 const unsigned s = 180; // FIXME These should be class variables
648 const unsigned c = 256 - s; // FIXME Choose <s,c> values!
649 const unsigned b = 8; // one byte.
654 while (Tools::GetVariableField(B, b, offset) < c)
656 i = i*c + Tools::GetVariableField(B, b, offset);
661 i = i * s + (Tools::GetVariableField(B, b, offset) - c);
667 ulong * BSGAP::DeltaEncode(ulong value, ulong &bits, bool onlyPositive, bool negative)
670 // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf
671 const unsigned s = 180; // FIXME These should be class variables
672 const unsigned c = 256 - s; // FIXME Choose <s,c> values!
673 const unsigned b = 8; // one byte
675 // Calculate size first:
685 // bits is now the length of the whole codeword
686 ulong *B = new ulong[bits/W + 1];
688 // output codeword (backwards):
689 unsigned offset = bits - b;
690 ulong output = c + (value % s);
691 Tools::SetVariableField(B, b, offset, output);
700 Tools::SetVariableField(B, b, offset, output);
710 * Saving the following data fields:
711 * ulong u, n; // Universe size, number of 1-bits in B
712 * ulong topCount; // Top structure and the number of substructures
714 * ulong *P; // Pointer to BSGAP structures
715 * unsigned b; // Subdictionary size (\log^2 n)
716 * BlockArray *offsetA; // Array of pointers (into bitvector P)
717 * BlockArray *firstKeyA; // Array of first key positions of the substructures
719 void BSGAP::Save(FILE *file) const
721 if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1)
722 throw std::runtime_error("BSGAP::Save(): file write error (u).");
724 if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1)
725 throw std::runtime_error("BSGAP::Save(): file write error (n).");
730 if (std::fwrite(&(this->topCount), sizeof(ulong), 1, file) != 1)
731 throw std::runtime_error("BSGAP::Save(): file write error (topCount).");
733 if (std::fwrite(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
734 throw std::runtime_error("BSGAP::Save(): file write error (bitsInP).");
736 for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
738 if (std::fwrite((this->P)+offset, sizeof(ulong), 1, file) != 1)
739 throw std::runtime_error("BSGAP::Save(): file write error (P).");
742 if (std::fwrite(&(this->b), sizeof(unsigned), 1, file) != 1)
743 throw std::runtime_error("BSGAP::Save(): file write error (b).");
746 firstKeyA->Save(file);
752 BSGAP::BSGAP(FILE *file)
753 : u(0), n(0), topCount(0), bitsInP(0), P(0), b(0), offsetA(0), firstKeyA(0)
755 if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1)
756 throw std::runtime_error("BSGAP::Load(): file read error (u).");
758 if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1)
759 throw std::runtime_error("BSGAP::Load(): file read error (n).");
764 if (std::fread(&(this->topCount), sizeof(ulong), 1, file) != 1)
765 throw std::runtime_error("BSGAP::Load(): file read error (topCount).");
767 if (std::fread(&(this->bitsInP), sizeof(ulong), 1, file) != 1)
768 throw std::runtime_error("BSGAP::Load(): file read error (bitsInP).");
770 P = new ulong[bitsInP/W+1];
771 for (ulong offset = 0; offset < bitsInP/W+1; offset ++)
773 if (std::fread(this->P+offset, sizeof(ulong), 1, file) != 1)
774 throw std::runtime_error("BSGAP::Load(): file read error (P).");
777 if (std::fread(&(this->b), sizeof(unsigned), 1, file) != 1)
778 throw std::runtime_error("BSGAP::Load(): file read error (b).");
780 offsetA = new BlockArray(file);
781 firstKeyA = new BlockArray(file);