X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=BSGAP.h;fp=BSGAP.h;h=a1fba5ddca971c16dc73a113bcda74086cab2765;hb=dbd62fe27e80414b4cf4fbf512ad1910916e6967;hp=0000000000000000000000000000000000000000;hpb=dee0161e4f31f06e9389db9986766395c1b1d2b8;p=SXSI%2FTextCollection.git diff --git a/BSGAP.h b/BSGAP.h new file mode 100644 index 0000000..a1fba5d --- /dev/null +++ b/BSGAP.h @@ -0,0 +1,119 @@ +/** + * Binary-searchable gap encoding scheme (BSGAP) + * Niko Välimäki + * + * Based on: + * + * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures: + * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007). + */ +#ifndef _BSGAP_H_ +#define _BSGAP_H_ + +#include "Tools.h" +#include "BlockArray.h" + +class BSGAP +{ +public: + BSGAP(ulong *, ulong, bool = false, ulong = 0); + BSGAP(FILE *); + ~BSGAP(); + ulong rank(ulong); + ulong rank0(ulong); + ulong rank(bool b, ulong i) {return b?rank(i):i+1-rank(i);}; + ulong select(ulong); + ulong select0(ulong); + ulong select(bool b, ulong i) {return b?select(i):select0(i);}; + + bool IsBitSet(ulong pos); + // Returns also the rank_1(pos) via the second param + bool IsBitSet(ulong pos, ulong *); + + ulong size() {return u;}; + ulong bitsSet() {return n;} + void Save(FILE *) const; +private: + static const uchar bit_table[]; + ulong u, n; // Universe size, number of 1-bits in B + ulong topCount; // Top structure and the number of substructures + ulong bitsInP; // Length of bit vector P + ulong *P; // Pointer to BSGAP structures + unsigned b; // Subdictionary size (\log^2 n) + BlockArray *offsetA; // Array of pointers (into bitvector P) + BlockArray *firstKeyA; // Array of first key positions of the substructures + + ulong rank(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool); + bool IsBitSet(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool); + bool IsBitSet(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool, ulong *); + ulong select(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool); + ulong select0(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool); + ulong * GetSubtree(ulong *, ulong, ulong, ulong, ulong, bool, ulong &); + ulong DeltaDecode(ulong *, ulong &); + ulong DeltaDecode(ulong *, ulong &, bool &); + ulong * DeltaEncode(ulong, ulong &, bool = true, bool = false); + void BitCopy(ulong *, ulong *, ulong, ulong); + ulong FindMedian(ulong *, ulong, ulong, ulong, ulong); + unsigned FindLowestBit(ulong); + + inline bool IsSubtreeFull(ulong firstKey, ulong l, ulong r, ulong n) + { + if (l == firstKey) + n --; + + if (r - l - 1 == n) + return true; + return false; + } + /** + * Returns the number of zeros in the left subtree + */ + inline ulong numberOfZeros(ulong firstKey, ulong l, ulong key, ulong n) + { + ulong z = key - l - 1 - n/2; + if (l == firstKey) + ++z; + if (z == -1lu) + z = 0; + return z; + } + + inline ulong FindRightSubtree(ulong *B, ulong offset, ulong firstKey, ulong key, ulong l, ulong r, ulong n) + { + if (n - n/2 - 1 == 0) + return 0; // Does not exists + + if (IsSubtreeFull(firstKey, key, r, n - n/2 - 1)) + return 0; // Right subtree is full (does not exists) + + if (n/2 == 0) + return offset; // Left subtree does not exists + + if (IsSubtreeFull(firstKey, l, key, n/2)) + return offset; // Left subtree is full (does not exists) + + ulong len = DeltaDecode(B, offset); + return offset + len; // Skipping left subtree + } + + inline ulong FindLeftSubtree(ulong *B, ulong offset, ulong firstKey, ulong key, ulong l, ulong r, ulong n) + { + if (n/2 == 0) + return 0; // Does not exists + + if (IsSubtreeFull(firstKey, l, key, n/2)) + return 0; // Left subtree is full + + if (n - n/2 - 1 == 0) + return offset; // Right subtree does not exists + + if (IsSubtreeFull(firstKey, key, r, n - n/2 - 1)) + return offset; // Right subtree is full + + DeltaDecode(B, offset); // Calculate new offset + return offset; // Left subtree is the first one + } + +}; + +#endif