From dbd62fe27e80414b4cf4fbf512ad1910916e6967 Mon Sep 17 00:00:00 2001 From: nvalimak Date: Sat, 7 Feb 2009 17:00:01 +0000 Subject: [PATCH] Update: libcds WT, empty texts in bitv. git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@138 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- BSGAP.cpp | 779 ++++++++++++++++++++++++++++++++++++++++++++++ BSGAP.h | 119 +++++++ BitRank.cpp | 62 +++- BitRank.h | 7 +- BlockArray.h | 40 ++- CSA.cpp | 535 +++++++++++++++++++------------ CSA.h | 98 +++++- GapEncode.cpp | 151 +++++++++ GapEncode.h | 113 +++++++ RLWaveletTree.cpp | 383 +++++++++++++++++++++++ RLWaveletTree.h | 64 ++++ TextCollection.h | 8 +- Tools.h | 5 + makefile | 6 +- 14 files changed, 2136 insertions(+), 234 deletions(-) create mode 100644 BSGAP.cpp create mode 100644 BSGAP.h create mode 100644 GapEncode.cpp create mode 100644 GapEncode.h create mode 100644 RLWaveletTree.cpp create mode 100644 RLWaveletTree.h diff --git a/BSGAP.cpp b/BSGAP.cpp new file mode 100644 index 0000000..b2db287 --- /dev/null +++ b/BSGAP.cpp @@ -0,0 +1,779 @@ +/** + * Binary-searchable gap encoding scheme (BSGAP) + * Niko Välimäki + * + * Compile: g++ -Wall -o testBSGAP BSGAP.cpp Tools.cpp + * + * Based on: + * + * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures: + * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007). + */ + +#include "BSGAP.h" +#include +#include +using namespace std; + + +const uchar BSGAP::bit_table[] = { + 0,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0, + 1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,6,0,1,0,2,0,1,0,3,0,1,0, + 2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0, + 1,0,2,0,1,0,3,0,1,0,2,0,1,0,7,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0, + 3,0,1,0,2,0,1,0,5,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0, + 1,0,6,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1,0,5,0,1,0, + 2,0,1,0,3,0,1,0,2,0,1,0,4,0,1,0,2,0,1,0,3,0,1,0,2,0,1}; + +/** + * samplerate == number of keys in each gap encoded binary search tree + * default value is log^2 n where n is the number of keys. + */ +BSGAP::BSGAP(ulong *B, ulong u, bool freeB, ulong sampleRate) +{ + this->u = u; + this->n = 0; + this->P = 0; + this->offsetA = 0; + this->firstKeyA = 0; + this->topCount = 0; + + // Count keys + for (ulong i = 0; i < u; i ++) + if (Tools::GetField(B, 1, i)) + this->n ++; + + if (n == 0) // Sanity check + { + if (freeB && B) + delete [] B; + + return; + } + + unsigned log2n = Tools::FloorLog2(this->n); + if (log2n < 2) + log2n = 2; + if (sampleRate == 0) + this->b = log2n * log2n; + else + this->b = sampleRate; + + ulong firstKey = 0, lastKey; + // Find the first key + while (!Tools::GetField(B, 1, firstKey)) + firstKey ++; + + // Temp array of BSGAP structures + ulong **tempP = new ulong * [n / b + 1]; + ulong *bitsInBSD = new ulong [n / b + 1]; + ulong totalBits = 0; + this->firstKeyA = new BlockArray(n/b + 1, Tools::CeilLog2(this->u)); + + for (ulong i = 0; i < n/b; i ++) + { + // Find the last key + lastKey = firstKey; + for (ulong k = 0; k < b + 1 && lastKey < u; lastKey ++) + if (Tools::GetField(B, 1, lastKey)) + k ++; + lastKey --; + // Sanity check + if (lastKey > u) + { + cout << "error: lastKey = " << lastKey << ", u = " << u << endl; + exit(0); + } + + // lastKey of the last substructure is this->u + if (i == n/b - 1 && n%b == 0) + lastKey = u; + + tempP[i] = GetSubtree(B, firstKey, firstKey, lastKey, b, true, bitsInBSD[i]); // FIXME: left of right subtree? + totalBits += bitsInBSD[i]; + (*firstKeyA)[i] = firstKey; + + firstKey = lastKey; + } + + if (n - (n/b) * b != 0) + { + // Find the first key + while (!Tools::GetField(B, 1, firstKey)) + firstKey ++; + + tempP[n/b] = GetSubtree(B, firstKey, firstKey, u, n - (n/b) * b, true, bitsInBSD[n/b]); // FIXME: left of right subtree? + totalBits += bitsInBSD[n/b]; + (*firstKeyA)[n/b] = firstKey; + } + + // Number of nodes: + this->topCount = n%b ? n/b + 1: n/b; + + // Catenate binary trees into one bitvector + this->P = new ulong[totalBits/W + 1]; + this->bitsInP = totalBits; + ulong offset = 0; + for (ulong i = 0; i < this->topCount ; ++i) + { + BitCopy(P, tempP[i], offset, bitsInBSD[i]); + delete [] tempP[i]; + offset += bitsInBSD[i]; + } + delete [] tempP; + + // Pointers to bit vector P (binary tree starting offsets) + // FIXME Use more succinct representation? + offset = 0; + this->offsetA = new BlockArray(this->topCount, Tools::CeilLog2(totalBits)); + for (ulong i = 0; i < this->topCount ; ++i) + { + (*offsetA)[i] = offset; + offset += bitsInBSD[i]; + } + delete [] bitsInBSD; + + if (freeB) + delete [] B; +} + +BSGAP::~BSGAP() +{ + delete [] P; + delete offsetA; + delete firstKeyA; +} + +ulong BSGAP::rank(ulong i) +{ + if (n == 0) // Trivial case + return 0; + if (i >= u) + throw std::out_of_range("BSGAP::rank(): Parameter i was out of range"); + if ((*firstKeyA)[0] > i) + return 0; + ulong l = 0, r = topCount - 1; + while (l < r) // Binary search on the array firstKeyA + { + ulong mid = l + (r - l)/2lu; + if ((*firstKeyA)[mid] < i) + l = mid + 1; + else + r = mid; + } + if ((*firstKeyA)[l] > i && l == 0) + return 0; + if ((*firstKeyA)[l] == i) + return 1 + l * b; + if ((*firstKeyA)[l] > i) + l --; + + ulong lastKey, firstKey = (*firstKeyA)[l]; + if (l == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[l + 1]; + ulong nSub = b; + if (l == topCount - 1 && n%b) + nSub = n%b; + + ulong k = rank(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true); + // Add rank of substructures P[0], P[1], ..., P[j - 2] + return l * b + k; +} + +ulong BSGAP::rank0(ulong i) +{ + return i - rank(i) + 1; +} + +bool BSGAP::IsBitSet(ulong i) +{ + if (n == 0) // Trivial case + return false; + if (i >= u) + return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range"); + if ((*firstKeyA)[0] > i) + return false; + ulong l = 0, r = topCount - 1; + while (l < r) // Binary search on the array firstKeyA + { + ulong mid = l + (r - l)/2lu; + if ((*firstKeyA)[mid] < i) + l = mid + 1; + else + r = mid; + } + if ((*firstKeyA)[l] > i && l == 0) + return false; + if ((*firstKeyA)[l] == i) + return true; + if ((*firstKeyA)[l] > i) + l --; + + ulong lastKey, firstKey = (*firstKeyA)[l]; + if (l == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[l + 1]; + ulong nSub = b; + if (l == topCount - 1 && n%b) + nSub = n%b; + + return IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true); +} + +/** + * Returns also rank_1(i) via the second parameter. + */ +bool BSGAP::IsBitSet(ulong i, ulong *resultRank) +{ + *resultRank = 0; + if (n == 0) // Trivial case + return false; + if (i >= u) + { + *resultRank = n; + return false; // FIXME throw std::out_of_range("BSGAP::rank(): Parameter i was out of range"); + } + if ((*firstKeyA)[0] > i) + return false; + ulong l = 0, r = topCount - 1; + while (l < r) // Binary search on the array firstKeyA + { + ulong mid = l + (r - l)/2lu; + if ((*firstKeyA)[mid] < i) + l = mid + 1; + else + r = mid; + } + if ((*firstKeyA)[l] > i && l == 0) + return false; + if ((*firstKeyA)[l] == i) + { + *resultRank = 1 + l * b; + return true; + } + if ((*firstKeyA)[l] > i) + l --; + + ulong lastKey, firstKey = (*firstKeyA)[l]; + if (l == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[l + 1]; + ulong nSub = b; + if (l == topCount - 1 && n%b) + nSub = n%b; + + bool result = IsBitSet(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true, resultRank); + *resultRank += l * b; + return result; +} + + +ulong BSGAP::select(ulong i) +{ + if (n == 0) + return 0; + if (i == 0) + return 0; + if (i > n) + throw std::out_of_range("BSGAP::select(): Parameter i was out of range"); + + ulong j = i/b; + if (i%b == 0) + j --; + ulong lastKey, firstKey = (*firstKeyA)[j]; + if (j == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[j + 1]; + ulong nSub = b; + if (j == topCount - 1 && n%b) + nSub = n%b; + + return select(P, (*offsetA)[j], firstKey, i - j * b, firstKey, lastKey, nSub, true); +} + +ulong BSGAP::select0(ulong i) +{ + if (n == 0) // Trivial case + return i-1; + if (i > u - n) + throw std::out_of_range("BSGAP::select0(): Parameter i was out of range"); + + ulong l = 0, r = topCount - 1; + while (l < r) // Binary search on the array firstKeyA + { + ulong mid = l + (r - l)/2lu; + if ((*firstKeyA)[mid] - mid*b < i) // Number of zeros before [mid] + l = mid + 1; + else + r = mid; + } + if (l == 0 && (*firstKeyA)[l] >= i) + return i-1; + if (l > 0 && (*firstKeyA)[l] - l*b >= i) + --l; // FIXME: TEST + + i -= (*firstKeyA)[l] - l*b; + + ulong lastKey, firstKey = (*firstKeyA)[l]; + if (l == topCount - 1) + lastKey = u; + else + lastKey = (*firstKeyA)[l + 1]; + ulong nSub = b; + if (l == topCount - 1 && n%b) + nSub = n%b; + + return select0(P, (*offsetA)[l], firstKey, i, firstKey, lastKey, nSub, true); +} + +ulong BSGAP::select0(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) +{ + while (1) + { + if (n == 0) + return l+i; + + // Check if subtree is full --- FIXME Make this test an assert()! + if (IsSubtreeFull(firstKey, l, r, n)) + throw runtime_error("BSGAP::select0(): subtree was full"); // Should not happen // (l == firstKey ? l + i - 1: l + i); + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + + if (numberOfZeros(firstKey, l, key, n) < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + i = i - numberOfZeros(firstKey, l, key, n); + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + + +ulong BSGAP::rank(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) +{ + // Number of keys in subtrees on left-side + ulong result = 0; + + while (1) + { + + if (n == 0) + return result; + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + return result + (l == firstKey ? i - l + 1 : i - l); + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + if (key == i) + return result + n/2 + 1; + + if (key < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + result += n/2 + 1; + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + + +bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) +{ + // Number of keys in subtrees on left-side + ulong result = 0; + + while (1) + { + + if (n == 0) + return false; + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + return result + (l == firstKey ? i - l + 1 : i - l); + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + if (key == i) + return true; + + if (key < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + result += n/2 + 1; + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + +// Returns also the rank of i +bool BSGAP::IsBitSet(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild, ulong *resultRank) +{ + // Number of keys in subtrees on left-side + ulong result = 0; + + while (1) + { + + if (n == 0) + { + *resultRank = result; + return false; + } + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + { + *resultRank = result + (l == firstKey ? i - l + 1 : i - l); + return true; + } + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + if (key == i) + { + *resultRank = result + n/2 + 1; + return true; + } + + if (key < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + result += n/2 + 1; + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + + +ulong BSGAP::select(ulong *B, ulong offset, ulong firstKey, ulong i, ulong l, ulong r, ulong n, bool leftChild) +{ + while (1) + { + if (n == 0) + return 0; + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + return (l == firstKey ? l + i - 1: l + i); + + // Parse key value + ulong key; + ulong x = DeltaDecode(B, offset); + if (leftChild) + key = r - x; + else + key = l + x; + if (n/2 + 1 == i) + return key; + if (n/2 + 1 < i) + { + offset = FindRightSubtree(B, offset, firstKey, key, l, r, n); + i = i - n/2 - 1; + l = key; + n = n - n/2 - 1; + leftChild = false; + } + else + { + offset = FindLeftSubtree(B, offset, firstKey, key, l, r, n); + r = key; + n = n/2; + leftChild = true; + } + } +} + +ulong * BSGAP::GetSubtree(ulong *B, ulong firstKey, ulong l, ulong r, ulong n, bool leftChild, ulong &bits) +{ + bits = 0; + if (n == 0) + return 0; + + // Check if subtree is full + if (IsSubtreeFull(firstKey, l, r, n)) + return 0; + + // Pick the median + ulong key = FindMedian(B, firstKey, l, r, n); + + ulong *leftSub, *rightSub, leftBits, rightBits; + leftSub = GetSubtree(B, firstKey, l, key, n/2, true, leftBits); + rightSub = GetSubtree(B, firstKey, key, r, n - n/2 - 1, false, rightBits); + + // Encode key value + ulong *keyDelta, keyBits; + if (leftChild) + keyDelta = DeltaEncode(r - key, keyBits, true); + else + keyDelta = DeltaEncode(key - l, keyBits, true); + + // Encode jump offset if left and right subtrees exists + ulong *jumpOffset = 0, jumpBits = 0; + if (leftBits != 0 && rightBits != 0) + jumpOffset = DeltaEncode(leftBits, jumpBits); + + // bits is the sum of keyBits, jumpBits, leftBits and rightBits + bits = keyBits + jumpBits + leftBits + rightBits; + ulong *output = new ulong[bits / W + 1]; + + /** + * Write "output" + */ + BitCopy(output, keyDelta, 0, keyBits); + delete [] keyDelta; + ulong offset = keyBits; + + if (jumpBits != 0) + { + BitCopy(output, jumpOffset, offset, jumpBits); + offset += jumpBits; + delete [] jumpOffset; + } + + BitCopy(output, leftSub, offset, leftBits); + offset += leftBits; + BitCopy(output, rightSub, offset, rightBits); + offset += rightBits; + + assert(offset == bits); + if (leftBits != 0) + delete [] leftSub; + if (rightBits != 0) + delete [] rightSub; + return output; +} + +void BSGAP::BitCopy(ulong *dest, ulong *src, ulong offset, ulong len) +{ + if (len == 0) + return; + ulong i; + + if (len <= W) + { + Tools::SetVariableField(dest, len, offset, *src); + return; + } + + for (i = 0; i < len/W; i ++) + Tools::SetVariableField(dest, W, offset + i * W, src[i]); + + if (i * W < len) + Tools::SetVariableField(dest, len - i*W, offset + i * W, src[i]); +} + +ulong BSGAP::FindMedian(ulong *B, ulong firstKey, ulong l, ulong r, ulong n) +{ + // Linear scan: slow but affects only the construction time + n = n/2 + 1; + if (l != firstKey) + l ++; // Skip left boundary + + while (n && l <= r) + { + if (Tools::GetField(B, 1, l)) + n --; + l ++; + } + return l - 1; +} + +ulong BSGAP::DeltaDecode(ulong *B, ulong &offset) +{ + // Dense coding + // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf + const unsigned s = 180; // FIXME These should be class variables + const unsigned c = 256 - s; // FIXME Choose values! + const unsigned b = 8; // one byte. + + ulong i = 0; + ulong base = 0; + ulong tot = s; + while (Tools::GetVariableField(B, b, offset) < c) + { + i = i*c + Tools::GetVariableField(B, b, offset); + offset += b; + base += tot; + tot = tot * c; + } + i = i * s + (Tools::GetVariableField(B, b, offset) - c); + offset += b; + return i + base; +} + + +ulong * BSGAP::DeltaEncode(ulong value, ulong &bits, bool onlyPositive, bool negative) +{ + // Dense coding + // http://www.dcc.uchile.cl/~gnavarro/ps/ir06.pdf + const unsigned s = 180; // FIXME These should be class variables + const unsigned c = 256 - s; // FIXME Choose values! + const unsigned b = 8; // one byte + + // Calculate size first: + bits = b; + ulong x = value / s; + while (x > 0) + { + --x; + bits += b; + x = x/c; + } + + // bits is now the length of the whole codeword + ulong *B = new ulong[bits/W + 1]; + + // output codeword (backwards): + unsigned offset = bits - b; + ulong output = c + (value % s); + Tools::SetVariableField(B, b, offset, output); + + x = value / s; + while (x > 0) + { + --x; + offset -= b; + + output = x % c; + Tools::SetVariableField(B, b, offset, output); + + x = x/c; + } + + return B; +} + + +/** + * Saving the following data fields: + * ulong u, n; // Universe size, number of 1-bits in B + * ulong topCount; // Top structure and the number of substructures + * ulong bitsInP; + * ulong *P; // Pointer to BSGAP structures + * unsigned b; // Subdictionary size (\log^2 n) + * BlockArray *offsetA; // Array of pointers (into bitvector P) + * BlockArray *firstKeyA; // Array of first key positions of the substructures + */ +void BSGAP::Save(FILE *file) const +{ + if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (u)."); + + if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (n)."); + + if (std::fwrite(&(this->topCount), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (topCount)."); + + if (std::fwrite(&(this->bitsInP), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (bitsInP)."); + + for (ulong offset = 0; offset < bitsInP/W+1; offset ++) + { + if (std::fwrite(this->P+offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (P)."); + } + + if (std::fwrite(&(this->b), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("BSGAP::Save(): file write error (b)."); + + offsetA->Save(file); + firstKeyA->Save(file); +} + +/** + * Load from file + */ +BSGAP::BSGAP(FILE *file) +{ + if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (u)."); + + if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (n)."); + + if (std::fread(&(this->topCount), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (topCount)."); + + if (std::fread(&(this->bitsInP), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (bitsInP)."); + + P = new ulong[bitsInP/W+1]; + for (ulong offset = 0; offset < bitsInP/W+1; offset ++) + { + if (std::fread(this->P+offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (P)."); + } + + if (std::fread(&(this->b), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("BSGAP::Load(): file read error (b)."); + + offsetA = new BlockArray(file); + firstKeyA = new BlockArray(file); +} + diff --git a/BSGAP.h b/BSGAP.h new file mode 100644 index 0000000..a1fba5d --- /dev/null +++ b/BSGAP.h @@ -0,0 +1,119 @@ +/** + * Binary-searchable gap encoding scheme (BSGAP) + * Niko Välimäki + * + * Based on: + * + * Ankur Gupta and Wing-Kai Hon and Rahul Shah and Jeffrey Scott Vitter. Compressed data structures: + * Dictionaries and data-aware measures, Theor. Comput. Sci., Volume 387, Issue 3 (November 2007). + */ +#ifndef _BSGAP_H_ +#define _BSGAP_H_ + +#include "Tools.h" +#include "BlockArray.h" + +class BSGAP +{ +public: + BSGAP(ulong *, ulong, bool = false, ulong = 0); + BSGAP(FILE *); + ~BSGAP(); + ulong rank(ulong); + ulong rank0(ulong); + ulong rank(bool b, ulong i) {return b?rank(i):i+1-rank(i);}; + ulong select(ulong); + ulong select0(ulong); + ulong select(bool b, ulong i) {return b?select(i):select0(i);}; + + bool IsBitSet(ulong pos); + // Returns also the rank_1(pos) via the second param + bool IsBitSet(ulong pos, ulong *); + + ulong size() {return u;}; + ulong bitsSet() {return n;} + void Save(FILE *) const; +private: + static const uchar bit_table[]; + ulong u, n; // Universe size, number of 1-bits in B + ulong topCount; // Top structure and the number of substructures + ulong bitsInP; // Length of bit vector P + ulong *P; // Pointer to BSGAP structures + unsigned b; // Subdictionary size (\log^2 n) + BlockArray *offsetA; // Array of pointers (into bitvector P) + BlockArray *firstKeyA; // Array of first key positions of the substructures + + ulong rank(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool); + bool IsBitSet(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool); + bool IsBitSet(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool, ulong *); + ulong select(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool); + ulong select0(ulong *, ulong, ulong, ulong, ulong, ulong, ulong, bool); + ulong * GetSubtree(ulong *, ulong, ulong, ulong, ulong, bool, ulong &); + ulong DeltaDecode(ulong *, ulong &); + ulong DeltaDecode(ulong *, ulong &, bool &); + ulong * DeltaEncode(ulong, ulong &, bool = true, bool = false); + void BitCopy(ulong *, ulong *, ulong, ulong); + ulong FindMedian(ulong *, ulong, ulong, ulong, ulong); + unsigned FindLowestBit(ulong); + + inline bool IsSubtreeFull(ulong firstKey, ulong l, ulong r, ulong n) + { + if (l == firstKey) + n --; + + if (r - l - 1 == n) + return true; + return false; + } + /** + * Returns the number of zeros in the left subtree + */ + inline ulong numberOfZeros(ulong firstKey, ulong l, ulong key, ulong n) + { + ulong z = key - l - 1 - n/2; + if (l == firstKey) + ++z; + if (z == -1lu) + z = 0; + return z; + } + + inline ulong FindRightSubtree(ulong *B, ulong offset, ulong firstKey, ulong key, ulong l, ulong r, ulong n) + { + if (n - n/2 - 1 == 0) + return 0; // Does not exists + + if (IsSubtreeFull(firstKey, key, r, n - n/2 - 1)) + return 0; // Right subtree is full (does not exists) + + if (n/2 == 0) + return offset; // Left subtree does not exists + + if (IsSubtreeFull(firstKey, l, key, n/2)) + return offset; // Left subtree is full (does not exists) + + ulong len = DeltaDecode(B, offset); + return offset + len; // Skipping left subtree + } + + inline ulong FindLeftSubtree(ulong *B, ulong offset, ulong firstKey, ulong key, ulong l, ulong r, ulong n) + { + if (n/2 == 0) + return 0; // Does not exists + + if (IsSubtreeFull(firstKey, l, key, n/2)) + return 0; // Left subtree is full + + if (n - n/2 - 1 == 0) + return offset; // Right subtree does not exists + + if (IsSubtreeFull(firstKey, key, r, n - n/2 - 1)) + return offset; // Right subtree is full + + DeltaDecode(B, offset); // Calculate new offset + return offset; // Left subtree is the first one + } + +}; + +#endif diff --git a/BitRank.cpp b/BitRank.cpp index 61dcc42..bccd82c 100644 --- a/BitRank.cpp +++ b/BitRank.cpp @@ -92,8 +92,6 @@ BitRank::BitRank(ulong *bitarray, ulong n, bool owner) { data=bitarray; this->owner = owner; this->n=n; // length of bitarray in bits - b = W; // b is a word - s=b*superFactor; ulong aux=(n+1)%W; if (aux != 0) integers = (n+1)/W+1; @@ -103,7 +101,7 @@ BitRank::BitRank(ulong *bitarray, ulong n, bool owner) { } BitRank::~BitRank() { - delete [] Rs; + delete Rs; delete [] Rb; if (owner) delete [] data; } @@ -112,16 +110,16 @@ BitRank::~BitRank() { void BitRank::BuildRank() { ulong num_sblock = n/s; - ulong num_block = n/b; - Rs = new ulong[num_sblock+1];//+1 we add the 0 pos + ulong num_block = n/W; + Rs = new BlockArray(num_sblock+1, Tools::CeilLog2(n));//+1 we add the 0 pos Rb = new uchar[num_block+1];//+1 we add the 0 pos ulong j; - Rs[0] = 0lu; + (*Rs)[0] = 0lu; for (j=1;j<=num_sblock;j++) { - Rs[j]=BuildRankSub((j-1)*superFactor,superFactor)+Rs[j-1]; + (*Rs)[j]=BuildRankSub((j-1)*superFactor,superFactor)+(*Rs)[j-1]; } Rb[0]=0; @@ -148,7 +146,7 @@ ulong BitRank::BuildRankSub(ulong ini, ulong bloques){ //this rank ask from 0 to n-1 ulong BitRank::rank(ulong i) { ++i; // the following gives sum of 1s before i - return Rs[i>>8]+Rb[i>>wordShift] + return (*Rs)[i>>8]+Rb[i>>wordShift] +popcount(data[i >> wordShift] & ((1lu << (i & Wminusone))-1)); } @@ -165,14 +163,14 @@ ulong BitRank::select(ulong x) { ulong l=0, r=n/s; ulong mid=(l+r)/2; - ulong rankmid = Rs[mid]; + ulong rankmid = (*Rs)[mid]; while (l<=r) { if (rankmid>8; @@ -233,14 +231,14 @@ ulong BitRank::select0(ulong x) { ulong l=0, r=n/s; ulong mid=(l+r)/2; - ulong rankmid = mid * s - Rs[mid]; + ulong rankmid = mid * s - (*Rs)[mid]; while (l<=r) { if (rankmid>8; @@ -298,3 +296,39 @@ bool BitRank::IsBitSet(ulong i) { ulong BitRank::NumberOfBits() { return n; } + +/** + * Saving data fields: + + ulong n; + ulong *data; //here is the bit-arra + */ +void BitRank::Save(FILE *file) +{ + if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BitRank::Save(): file write error (n)."); + + for (ulong offset = 0; offset < integers; ++offset) + if (std::fwrite(this->data + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BitRank::Save(): file write error (data)."); +} + +BitRank::BitRank(FILE *file) +{ + owner = 1; + if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BitRank::Load(): file read error (n)."); + + ulong aux=(n+1)%W; + if (aux != 0) + integers = (n+1)/W+1; + else + integers = (n+1)/W; + + data = new ulong[integers]; + for (ulong offset = 0; offset < integers; ++offset) + if (std::fread(this->data + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BitRank::Load(): file read error (data)."); + + BuildRank(); +} diff --git a/BitRank.h b/BitRank.h index c7783d2..49d3647 100644 --- a/BitRank.h +++ b/BitRank.h @@ -21,6 +21,7 @@ #ifndef _BITSTREAM_H_ #define _BITSTREAM_H_ +#include "BlockArray.h" #include "Tools.h" class BitRank { @@ -33,18 +34,20 @@ private: static const unsigned wordShift = 6; static const unsigned superFactor = 4; // 256 bit blocks #endif + static const unsigned s = W*superFactor; ulong *data; //here is the bit-array bool owner; ulong n,integers; - unsigned b,s; - ulong *Rs; //superblock array + BlockArray *Rs; //superblock array uchar *Rb; //block array ulong BuildRankSub(ulong, ulong); //internal use of BuildRank void BuildRank(); //crea indice para rank public: BitRank(ulong *, ulong, bool); + BitRank(FILE *); ~BitRank(); //destructor + void Save(FILE *); ulong rank(ulong i); //Rank from 0 to n-1 ulong select(ulong x); // gives the position of the x:th 1. ulong select0(ulong x); // gives the position of the x:th 0. diff --git a/BlockArray.h b/BlockArray.h index cc60c35..99aff91 100644 --- a/BlockArray.h +++ b/BlockArray.h @@ -1,7 +1,8 @@ #ifndef _BLOCK_ARRAY_H_ #define _BLOCK_ARRAY_H_ -#include #include "Tools.h" +#include +#include class BlockArray { @@ -43,7 +44,44 @@ public: ulong spaceInBits() { return n*blockLength+W; // plus 4 ulong's } + + /** + * Saving data fields: + * ulong n; + * ulong blockLength; + * ulong* data; + */ + void Save(FILE *file) const + { + if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BlockArray::Save(): file write error (n)."); + if (std::fwrite(&(this->blockLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BlockArray::Save(): file write error (blockLength)."); + for (ulong offset = 0; offset < n*blockLength/W+1; offset ++) + { + if (std::fwrite(this->data + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BlockArray::Save(): file write error (data)."); + } + } + + /** + * Load from file + */ + BlockArray(FILE *file) + { + if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BlockArray::Load(): file read error (n)."); + if (std::fread(&(this->blockLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BlockArray::Load(): file read error (blockLength)."); + + data = new ulong[n*blockLength/W+1]; + for (ulong offset = 0; offset < n*blockLength/W+1; offset ++) + { + if (std::fread(this->data + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("BlockArray::Load(): file read error (data)."); + } + } }; #endif diff --git a/CSA.cpp b/CSA.cpp index 5719154..a86407d 100644 --- a/CSA.cpp +++ b/CSA.cpp @@ -35,6 +35,8 @@ using std::map; using SXSI::TextCollection; +// Save file version info +const uchar CSA::versionFlag = 2; //////////////////////////////////////////////////////////////////////////// // Class CSA::THuffAlphabetRank @@ -120,6 +122,24 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() { delete bitrank; } +/** + * Saving data fields: + BitRank *bitrank; + bool leaf; + uchar ch; + left child; + right child; +*/ +void CSA::THuffAlphabetRank::Save(FILE *file) +{ + +} + +CSA::THuffAlphabetRank::THuffAlphabetRank(FILE *file) +{ + + +} /////////////////////////////////////////////f/////////////////////////////// // Class CSA @@ -129,9 +149,9 @@ CSA::THuffAlphabetRank::~THuffAlphabetRank() { * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. */ CSA::CSA(unsigned samplerate) - : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0), - codetable(0), dynFMI(0), numberOfTexts(0), maxTextLength(0), endmarkerDocId(0), - textLength(0), textStartPos(0) + : n(0), samplerate(0), alphabetrank(0), sampled(0), suffixes(0), suffixDocId(0), positions(0), + codetable(0), dynFMI(0), numberOfTexts(0), numberOfAllTexts(0), maxTextLength(0), endmarkerDocId(0), + textLength(0), textStartPos(0), emptyTextRank(0) { this->samplerate = samplerate; @@ -164,20 +184,38 @@ void CSA::InsertText(uchar const * text) if (m > maxTextLength) maxTextLength = m; // Store length of the longest text seen so far. - this->n += m; - this->numberOfTexts ++; - dynFMI->addText(text, m); + if (m > 1) + { + this->n += m; + this->numberOfTexts ++; + this->numberOfAllTexts ++; + dynFMI->addText(text, m); + } + else + { + emptyTextId.insert(numberOfAllTexts); // FIXME Using too much space here + this->numberOfAllTexts ++; // Replace with dynamic bitvector + } } void CSA::MakeStatic() { // Sanity check: if (dynFMI == 0) + throw std::runtime_error("CSA::MakeStatic(): Data structure is already static (dynFMI == 0)."); + + // Bitvector of empty texts { - std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl; - std::exit(0); + //std::cout << std::endl << "texts: " << numberOfTexts << ", all texts " << numberOfAllTexts << ", size : " << emptyTextId.size() <::const_iterator it = emptyTextId.begin(); it != emptyTextId.end(); ++it) + Tools::SetField(tempB, 1, (*it), 1); + emptyTextId.clear(); + emptyTextRank = new BSGAP(tempB, numberOfAllTexts, true); } - + uchar *bwt = dynFMI->getBWT(); /*printf("123456789012345678901234567890123456789\n"); for (TextPosition i = 0; i < n; i ++) @@ -199,24 +237,27 @@ void CSA::MakeStatic() delete dynFMI; dynFMI = 0; - makewavelet(bwt); + makewavelet(bwt); // Deletes bwt! + bwt = 0; // Calculate BWT end-marker position (of last inserted text) // and the length of the first text (counter l): ulong i = 0; ulong l = 1; - while (bwt[i] != '\0') + + uchar c = alphabetrank->access(i); + while (c != '\0') { - uchar c = bwt[i]; i = C[c]+alphabetrank->rank(c, i)-1; ++l; + c = alphabetrank->access(i); } bwtEndPos = i; + assert(bwtEndPos < n); //printf("bwtEndPos = %lu\n", bwtEndPos); - delete [] bwt; - // Build up arrays for text length and starting positions + // FIXME Remove, temp data textLength = new BlockArray(numberOfTexts, Tools::CeilLog2(maxTextLength)); textStartPos = new BlockArray(numberOfTexts, Tools::CeilLog2(this->n)); (*textLength)[0] = l; @@ -228,43 +269,71 @@ void CSA::MakeStatic() bool CSA::EmptyText(DocId k) const { - assert(k < (DocId)numberOfTexts); - return (1 == (*textLength)[k]); + assert(k < (DocId)numberOfTexts); + if (emptyTextRank->IsBitSet(k)) + return true; + return false; } uchar* CSA::GetText(DocId k) const { assert(k < (DocId)numberOfTexts); - uchar* result = new uchar[(*textLength)[k]]; + ulong textRank = 0; + if (emptyTextRank->IsBitSet(k, &textRank)) + { + uchar* result = new uchar[1]; + result[0] = 0; + return result; + } + // Map to non-empty text + k -= textRank; //emptyTextRank->rank(k); + TextPosition i = k; - TextPosition j = (*textLength)[k] - 1; - result[j] = '\0'; + // FIXME + string result; //TextPosition j = (*textLength)[k] - 1; - uchar c = alphabetrank->charAtPos(i); + uchar c = alphabetrank->access(i); while (c != '\0') { - --j; - result[j] = c; + result.push_back(c); i = C[c]+alphabetrank->rank(c,i)-1; - c = alphabetrank->charAtPos(i); // "next" char. + c = alphabetrank->access(i); // "next" char. } - assert(j == 0); - return result; + + // Convert to uchar (FIXME return string?) + i = result.size(); + uchar* res = new uchar[i+1]; + res[i] = '\0'; + for (ulong j = 0; j < i; ++j) + res[i-j-1] = result[j]; + return res; } - +/* + * Not supported uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const { assert(k < (DocId)numberOfTexts); assert(j < (*textLength)[k]); assert(i <= j); + ulong textRank = 0; + if (emptyTextRank->IsBitSet(k, &textRank)) + { + uchar* result = new uchar[1]; + result[0] = 0; + return result; + } + + // Map to non-empty text + k -= textRank; // emptyTextRank->rank(k); + // Start position of k'th text ulong start = (*textStartPos)[k]; return Substring(i + start, j-i+1); -} + }*/ /****************************************************************** * Existential queries @@ -286,7 +355,7 @@ bool CSA::IsPrefix(uchar const * pattern) const bool CSA::IsSuffix(uchar const *pattern) const { - // Here counting is as fast as isSuffix(): + // Here counting is as fast as IsSuffix(): if (CountSuffix(pattern) > 0) return true; return false; @@ -295,8 +364,12 @@ bool CSA::IsSuffix(uchar const *pattern) const bool CSA::IsEqual(uchar const *pattern) const { TextPosition m = std::strlen((char *)pattern); - //if (m == 0) - // return false; // FIXME Check for empty texts! + if (m == 0) + { + if (numberOfAllTexts - numberOfTexts > 0u) + return true; // Empty texts exists + return false; // No empty texts exists + } TextPosition sp = 0, ep = 0; // Match including end-marker @@ -337,7 +410,7 @@ unsigned CSA::CountPrefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return numberOfTexts; + return numberOfAllTexts; TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); @@ -350,7 +423,7 @@ unsigned CSA::CountSuffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return numberOfTexts; + return numberOfAllTexts; TextPosition sp = 0, ep = 0; // Search with end-marker @@ -362,8 +435,8 @@ unsigned CSA::CountSuffix(uchar const * pattern) const unsigned CSA::CountEqual(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); -// if (m == 0) -// return ; // FIXME Test for empty texts. + if (m == 0) + return numberOfAllTexts - numberOfTexts; // Empty texts. TextPosition sp = 0, ep = 0; // Match including end-marker @@ -375,6 +448,10 @@ unsigned CSA::CountEqual(uchar const *pattern) const unsigned CSA::CountContains(uchar const * pattern) const { + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return numberOfAllTexts; // Total number of texts. + // Here counting is as slow as fetching the result set // because we would have to filter out occ's that fall within same document. TextCollection::document_result result = Contains(pattern); @@ -395,7 +472,7 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all 1...k TextPosition sp = 0, ep = 0; Search(pattern, m, &sp, &ep); @@ -407,6 +484,9 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const if (resultSize == 0) return result; + result.reserve(resultSize); // Try to avoid reallocation. + + // Iterate through end-markers in [sp,ep]: unsigned i = 0; if (sp > 0) i = alphabetrank->rank(0, sp - 1); @@ -414,7 +494,10 @@ TextCollection::document_result CSA::Prefix(uchar const * pattern) const { // End-marker we found belongs to the "preceeding" doc in the collection DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); + -- resultSize; ++ i; } @@ -426,37 +509,46 @@ TextCollection::document_result CSA::Suffix(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all 1...k TextPosition sp = 0, ep = 0; // Search with end-marker Search(pattern, m+1, &sp, &ep); TextCollection::document_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); } + // Assert: c == '\0' OR sampled->IsBitSet(i) + if (c == '\0') { // Rank among the end-markers in BWT unsigned endmarkerRank = alphabetrank->rank(0, i) - 1; // End-marker that we found belongs to the "preceeding" doc in collection: - DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); } else // Sampled position { - TextPosition textPos = (*suffixes)[sampled->rank(i)-1]; - result.push_back(DocIdAtTextPos(textPos)); + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(docId); } } @@ -467,7 +559,7 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const { TextPosition m = strlen((char const *)pattern); if (m == 0) - return TextCollection::document_result(); + return TextCollection::document_result(); // FIXME Should return all empty texts TextPosition sp = 0, ep = 0; // Match including end-marker @@ -480,14 +572,19 @@ TextCollection::document_result CSA::Equal(uchar const *pattern) const if (resultSize == 0) return result; + result.reserve(resultSize); // Try to avoid reallocation. + unsigned i = 0; if (sp > 0) i = alphabetrank->rank(0, sp - 1); while (resultSize) { - // End-marker we found belongs to the "previous" doc in the collection + // End-marker we found belongs to the "previous" doc in the collection DocId docId = ((*endmarkerDocId)[i] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(docId); + -- resultSize; ++ i; } @@ -505,19 +602,20 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const TextPosition sp = 0, ep = 0; // Search all occurrences Search(pattern, m, &sp, &ep); - + // We want unique document indentifiers, using std::set to collect them std::set resultSet; + ulong sampled_rank_i = 0; // Check each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + c = alphabetrank->access(i); } if (c == '\0') { @@ -530,15 +628,17 @@ TextCollection::document_result CSA::Contains(uchar const * pattern) const } else { - TextPosition textPos = (*suffixes)[sampled->rank(i)-1]; - resultSet.insert(DocIdAtTextPos(textPos)); + DocId di = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; + assert((unsigned)di < numberOfTexts); + resultSet.insert(di); } } - // Convert std::set to std::vector (TODO better way to construct result vector?) - TextCollection::document_result result; - for (std::set::iterator it = resultSet.begin(); it != resultSet.end(); ++it) - result.push_back(*it); + // Convert std::set to std::vector + TextCollection::document_result result(resultSet.begin(), resultSet.end()); + // Map to doc ID's + for (document_result::iterator it = result.begin(); it != result.end(); ++it) + *it = emptyTextRank->select0(*it+1); return result; } @@ -556,24 +656,26 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const { TextPosition m = strlen((char *)pattern); if (m == 0) - return full_result(); + return full_result(); // FIXME Throw exception? TextPosition sp = 0, ep = 0; // Search all occurrences Search(pattern, m, &sp, &ep); full_result result; + result.reserve(ep-sp+1); // Try to avoid reallocation. + ulong sampled_rank_i = 0; // Report each occurrence for (; sp <= ep; ++sp) { TextPosition i = sp; TextPosition dist = 0; - uchar c = alphabetrank->charAtPos(i); - while (c != '\0' && !sampled->IsBitSet(i)) + uchar c = alphabetrank->access(i); + while (c != '\0' && !sampled->IsBitSet(i, &sampled_rank_i)) { - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping - c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c,i)-1; + c = alphabetrank->access(i); ++ dist; } if (c == '\0') @@ -583,14 +685,19 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const // End-marker that we found belongs to the "preceeding" doc in collection: DocId docId = ((*endmarkerDocId)[endmarkerRank] + 1) % numberOfTexts; + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); result.push_back(make_pair(docId, dist)); } else { - TextPosition textPos = (*suffixes)[sampled->rank(i)-1] + dist; - // Read document identifier and its starting position in text order - DocId docId = DocIdAtTextPos(textPos); - result.push_back(make_pair(docId, textPos - (*textStartPos)[docId])); + TextPosition textPos = (*suffixes)[sampled_rank_i-1]+dist; //sampled->rank(i)-1] + dist; + DocId docId = (*suffixDocId)[sampled_rank_i-1]; //sampled->rank(i)-1]; +// textPos = textPos - (*textStartPos)[docId]; // Offset inside the text + + // Map to doc ID: + docId = emptyTextRank->select0(docId+1); + result.push_back(make_pair(docId, textPos)); } } @@ -598,106 +705,59 @@ TextCollection::full_result CSA::FullContains(uchar const * pattern) const } -/** - * Finds document identifier for given text position - * - * Starting text position of the document is stored into second parameter. - * Binary searching on text starting positions. - */ -TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const -{ - assert(i < n); - - DocId a = 0; - DocId b = numberOfTexts - 1; - while (a < b) - { - DocId c = a + (b - a)/2; - if ((*textStartPos)[c] > i) - b = c - 1; - else if ((*textStartPos)[c+1] > i) - return c; - else - a = c + 1; - } - - assert(a < (DocId)numberOfTexts); - assert(i > (*textStartPos)[a]); - assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1])); - return a; -} - -/** - * Count end-markers in given interval - */ -unsigned CSA::CountEndmarkers(TextPosition sp, TextPosition ep) const -{ - if (sp > ep) - return 0; - - ulong ranksp = 0; - if (sp != 0) - ranksp = alphabetrank->rank(0, sp - 1); - - return alphabetrank->rank(0, ep) - ranksp; -} - /** * Save index to a file handle * * Throws a std::runtime_error exception on i/o error. * First byte that is saved represents the version number of the save file. - * In version 1 files, the data fields are: - * version info; - * samplerate; - * length of the BWT; - * end marker position in BWT; - * BWT string of length n; - * number of texts; - * length of longest text; - * - * array of pairs. - * - * TODO: Save the data structures instead of BWT sequence? - * TODO: Don't save textLength and textStartPos arrays. + * In version 2 files, the data fields are: + * uchar versionFlag; + TextPosition n; + unsigned samplerate; + unsigned C[256]; + TextPosition bwtEndPos; + static_sequence * alphabetrank; + BSGAP *sampled; + BlockArray *suffixes; + BlockArray *suffixDocId; + unsigned numberOfTexts; + unsigned numberOfAllTexts; + ulong maxTextLength; + BlockArray *endmarkerDocId; + BSGAP *emptyTextRank; */ void CSA::Save(FILE *file) const { - // Saving version 1 data: - uchar versionFlag = 1; + // Saving version info: if (std::fwrite(&versionFlag, 1, 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (version flag)."); + if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (n)."); if (std::fwrite(&(this->samplerate), sizeof(unsigned), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (samplerate)."); - if (std::fwrite(&(this->n), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (n)."); + for(ulong i = 0; i < 256; ++i) + if (std::fwrite(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (C table)."); if (std::fwrite(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (bwt end position)."); - - for (ulong offset = 0; offset < n; offset ++) - { - uchar c = alphabetrank->charAtPos(offset); - if (std::fwrite(&c, sizeof(uchar), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (bwt sequence)."); - } - unsigned r = numberOfTexts; - if (std::fwrite(&r, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (r)."); + alphabetrank->save(file); + sampled->Save(file); + suffixes->Save(file); + suffixDocId->Save(file); + if (std::fwrite(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (numberOfTexts)."); + if (std::fwrite(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Save(): file write error (numberOfAllTexts)."); if (std::fwrite(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) throw std::runtime_error("CSA::Save(): file write error (maxTextLength)."); - - for (r = 0; r < numberOfTexts; ++ r) - { - if (std::fwrite(&((*textLength)[r]), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (text length)."); - if (std::fwrite(&((*textStartPos)[r]), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Save(): file write error (text start)."); - } + + endmarkerDocId->Save(file); + emptyTextRank->Save(file); } @@ -713,70 +773,61 @@ void CSA::Load(FILE *file, unsigned samplerate) delete dynFMI; dynFMI = 0; delete alphabetrank; alphabetrank = 0; delete sampled; sampled = 0; - delete [] suffixes; suffixes = 0; - delete [] positions; positions = 0; + delete suffixes; suffixes = 0; + delete suffixDocId; suffixDocId = 0; + delete positions; positions = 0; delete [] codetable; codetable = 0; delete endmarkerDocId; endmarkerDocId = 0; + delete emptyTextRank; emptyTextRank = 0; + // FIXME Remove following: delete textLength; textLength = 0; delete textStartPos; textStartPos = 0; this->maxTextLength = 0; this->numberOfTexts = 0; + this->numberOfAllTexts = 0; this->samplerate = samplerate; this->n = 0; - uchar versionFlag = 0; - if (std::fread(&versionFlag, 1, 1, file) != 1) + uchar verFlag = 0; + if (std::fread(&verFlag, 1, 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (version flag)."); - if (versionFlag != 1) - throw std::runtime_error("CSA::Load(): invalid start byte."); + if (verFlag != CSA::versionFlag) + throw std::runtime_error("CSA::Load(): invalid save file version."); + if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (n)."); if (std::fread(&samplerate, sizeof(unsigned), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (samplerate)."); - if (this->samplerate == 0) - this->samplerate = samplerate; +// FIXME samplerate can not be changed during load. +// if (this->samplerate == 0) +// this->samplerate = samplerate; - if (std::fread(&(this->n), sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (n)."); + for(ulong i = 0; i < 256; ++i) + if (std::fread(this->C + i, sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (C table)."); if (std::fread(&(this->bwtEndPos), sizeof(TextPosition), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (bwt end position)."); - uchar *bwt = new uchar[n]; - for (ulong offset = 0; offset < n; offset ++) - if (std::fread((bwt + offset), sizeof(uchar), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (bwt sequence)."); - - unsigned r = 0; - if (std::fread(&r, sizeof(unsigned), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (r)."); + alphabetrank = static_sequence::load(file); + sampled = new BSGAP(file); + suffixes = new BlockArray(file); + suffixDocId = new BlockArray(file); + if (std::fread(&(this->numberOfTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (numberOfTexts)."); + if (std::fread(&(this->numberOfAllTexts), sizeof(unsigned), 1, file) != 1) + throw std::runtime_error("CSA::Load(): file read error (numberOfAllTexts)."); if (std::fread(&(this->maxTextLength), sizeof(ulong), 1, file) != 1) throw std::runtime_error("CSA::Load(): file read error (maxTextLength)."); - - // Build up arrays for text length and starting positions - textLength = new BlockArray(r, Tools::CeilLog2(maxTextLength)); - textStartPos = new BlockArray(r, Tools::CeilLog2(this->n)); - while (r > 0) - { - TextPosition length = 0, start = 0; - if (std::fread(&length, sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (text length)."); - if (std::fread(&start, sizeof(TextPosition), 1, file) != 1) - throw std::runtime_error("CSA::Load(): file read error (text start)."); - - (*textLength)[numberOfTexts] = length; - (*textStartPos)[numberOfTexts] = start; - ++numberOfTexts; - --r; - } + endmarkerDocId = new BlockArray(file); + emptyTextRank = new BSGAP(file); - // Construct data structures - makewavelet(bwt); - delete [] bwt; - maketables(); // FIXME: this will redo text length tables + // FIXME Construct data structures with new samplerate + //maketables(); // FIXME: this will redo text length tables } @@ -785,6 +836,8 @@ void CSA::Load(FILE *file, unsigned samplerate) * Rest of the functions follow... */ +/* + * Not supported uchar * CSA::Substring(TextPosition i, TextPosition l) const { uchar *result = new uchar[l + 1]; @@ -818,7 +871,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const for (dist = 0; dist < skip + l; dist++) { - int c = alphabetrank->charAtPos(j); + int c = alphabetrank->access(j); //charAtPos(j, &alphabetrank_tmp); if (c == '\0') { // Rank among the end-markers in BWT @@ -833,7 +886,7 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const result[l] = 0u; return result; } - +*/ /*TextPosition CSA::inverse(TextPosition i) { TextPosition skip = samplerate - i % samplerate; @@ -858,7 +911,8 @@ uchar * CSA::Substring(TextPosition i, TextPosition l) const return j; }*/ -ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { +ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const +{ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) int c = (int)pattern[m-1]; TextPosition i=m-1; @@ -877,7 +931,7 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return ep - sp + 1; else return 0; - } +} /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep) { @@ -890,12 +944,13 @@ ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, return 0; }*/ -CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma) +CSA::TextPosition CSA::Lookup(TextPosition i) const { TextPosition dist=0; while (!sampled->IsBitSet(i)) { - int c = alphabetrank->charAtPos(i); + + int c = alphabetrank->access(i); if (c == '\0') { // End-markers are sampled @@ -903,7 +958,7 @@ CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(sample DocId docId = (*endmarkerDocId)[endmarkerRank]; return (*textStartPos)[docId] + dist; } - i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + i = C[c]+alphabetrank->rank(c,i)-1; ++dist; } @@ -915,12 +970,14 @@ CSA::~CSA() { delete alphabetrank; delete sampled; delete suffixes; + delete suffixDocId; delete positions; delete [] codetable; delete endmarkerDocId; delete textLength; delete textStartPos; + delete emptyTextRank; } void CSA::makewavelet(uchar *bwt) @@ -948,9 +1005,30 @@ void CSA::makewavelet(uchar *bwt) C[i]=C[i-1]+prev; prev = temp; } - this->codetable = node::makecodetable(bwt,n); - alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); - //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n"); +// this->codetable = node::makecodetable(bwt,n); +// alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); +// delete [] bwt; + //alphabetrank = new RLWaveletTree(bwt, n); // Deletes bwt! + + uint *text = new uint[n]; + for (i = 0; i < n; ++i) // Silly + text[i] = bwt[i]; + delete [] bwt; + bwt = 0; + + alphabet_mapper * am = new alphabet_mapper_none(); + static_bitsequence_builder * bmb = new static_bitsequence_builder_rrr02(16); // FIXME samplerate? + + //cout << "Building Huffman table..."; cout.flush(); + + wt_coder * wtc = new wt_coder_huff(text,n,am); + + //cout << "done" << endl; cout.flush(); + //cout << "Building static_sequence..."; cout.flush(); + + alphabetrank = new static_sequence_wvtree(text,n,wtc,bmb,am); + delete [] text; + text = 0; /* for (i = 0; i < n; ++i) { @@ -966,9 +1044,9 @@ void CSA::maketables() ulong *sampledpositions = new ulong[n/W+1]; unsigned ceilLog2n = Tools::CeilLog2(n); - suffixes = new BlockArray(sampleLength, ceilLog2n); positions = new BlockArray(sampleLength, ceilLog2n); - + BlockArray* tmpSuffix = new BlockArray(sampleLength, ceilLog2n); + // Mapping from end-markers to doc ID's: endmarkerDocId = new BlockArray(numberOfTexts, Tools::CeilLog2(numberOfTexts)); @@ -977,6 +1055,7 @@ void CSA::maketables() sampledpositions[i]=0lu; ulong x,p=bwtEndPos; + ulong sampleCount = 0; // Keeping track of text position of end-markers seen ulong posOfSuccEndmarker = n; DocId textId = numberOfTexts; @@ -988,12 +1067,14 @@ void CSA::maketables() // i substitutes SA->GetPos(i) x=(i==n-1)?0:i+1; - if (x % samplerate == 0) { + if (x % samplerate == 0 && posOfSuccEndmarker - x > samplerate) { Tools::SetField(sampledpositions,1,p,1); - (*positions)[x/samplerate] = p; + (*positions)[sampleCount] = p; + (*tmpSuffix)[sampleCount] = x; // FIXME remove + sampleCount ++; } - uchar c = alphabetrank->charAtPos(p); + uchar c = alphabetrank->access(p); if (c == '\0') { --textId; @@ -1033,14 +1114,66 @@ void CSA::maketables() assert(dynTextLength[i].second == (*textStartPos)[i]); } */ - sampled = new BitRank(sampledpositions,n,true); - + sampled = new BSGAP(sampledpositions,n,true); + sampleLength = sampled->rank(n-1); + assert(sampleCount == sampleLength); +// std::cout << ";sampleLength;" << sampleLength << std::endl; + // Suffixes == offset from text start position + suffixes = new BlockArray(sampleLength, Tools::CeilLog2(maxTextLength)); + suffixDocId = new BlockArray(sampleLength, Tools::CeilLog2(numberOfTexts)); + //suffixes: for(i=0; irank((*positions)[i]); if (j==0) j=sampleLength; - (*suffixes)[j-1] = (i*samplerate==n)?0:i*samplerate; - } + TextPosition textPos = (*tmpSuffix)[i]; //(i*samplerate==n)?0:i*samplerate; + (*suffixDocId)[j-1] = DocIdAtTextPos(textPos); // (*suffixes)[j-1]); + + assert((unsigned)DocIdAtTextPos(textPos) < numberOfTexts); + assert((*suffixDocId)[j-1] < numberOfTexts); + // calculate offset from text start: + (*suffixes)[j-1] = textPos - (*textStartPos)[(*suffixDocId)[j-1]]; + } + // FIXME Temp, remove + delete tmpSuffix; + tmpSuffix=0; + delete positions; + positions =0; + delete textLength; + textLength = 0; + delete textStartPos; + textStartPos = 0; +} + + +/** + * Finds document identifier for given text position + * + * Starting text position of the document is stored into second parameter. + * Binary searching on text starting positions. + */ +TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const +{ + assert(i < n); + + DocId a = 0; + DocId b = numberOfTexts - 1; + while (a < b) + { + DocId c = a + (b - a)/2; + if ((*textStartPos)[c] > i) + b = c - 1; + else if ((*textStartPos)[c+1] > i) + return c; + else + a = c + 1; + } + + assert(a < (DocId)numberOfTexts); + assert(i >= (*textStartPos)[a]); + assert(i < (a == (DocId)numberOfTexts - 1 ? n : (*textStartPos)[a+1])); + return a; } CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n) diff --git a/CSA.h b/CSA.h index bd7b199..b9a1745 100644 --- a/CSA.h +++ b/CSA.h @@ -20,13 +20,20 @@ #ifndef _CSA_H_ #define _CSA_H_ +#include "dynFMI.h" + +// Include from XMLTree/libcds +#include +#include +#include +#include -#include +#include #include #include "BitRank.h" -#include "dynFMI.h" #include "TextCollection.h" #include "BlockArray.h" +#include "RLWaveletTree.h" /** * Implementation of the TextCollection interface @@ -58,7 +65,7 @@ public: void MakeStatic(); bool EmptyText(DocId) const; uchar* GetText(DocId) const; - uchar* GetText(DocId, TextPosition, TextPosition) const; +// uchar* GetText(DocId, TextPosition, TextPosition) const; bool IsPrefix(uchar const *) const; bool IsSuffix(uchar const *) const; @@ -86,8 +93,31 @@ public: void Load(FILE *, unsigned); void Save(FILE *) const; - // debug - TextPosition Lookup(TextPosition) const; + + // Debug FIXME Remove + void deleteWT() + { + delete alphabetrank; + alphabetrank = 0; + delete [] codetable; + codetable = 0; + } + void deleteSamples() + { + delete sampled; + sampled =0; + delete suffixes; + suffixes = 0; + delete positions; + positions = 0; + delete suffixDocId; + suffixDocId = 0; + } + void deleteEndmarker() + { + delete endmarkerDocId; + endmarkerDocId = 0; + } private: class TCodeEntry { @@ -110,7 +140,10 @@ private: bool leaf; public: THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned); + THuffAlphabetRank(FILE *); ~THuffAlphabetRank(); + + void Save(FILE *); bool Test(uchar *, TextPosition); inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i @@ -151,7 +184,7 @@ private: } return true; } - inline int charAtPos(TextPosition i) const { + inline uchar access(TextPosition i) const { THuffAlphabetRank const * temp=this; while (!temp->leaf) { if (temp->bitrank->IsBitSet(i)) { @@ -163,7 +196,22 @@ private: temp = temp->left; } } - return (int)temp->ch; + return temp->ch; + } + + inline uchar charAtPos(ulong i, TextPosition *rank) const { + THuffAlphabetRank const * temp=this; + while (!temp->leaf) { + if (temp->bitrank->IsBitSet(i)) { + i = temp->bitrank->rank(i)-1; + temp = temp->right; + } else { + i = i-temp->bitrank->rank(i); + temp = temp->left; + } + } + (*rank)=i; + return temp->ch; } }; @@ -202,23 +250,28 @@ private: static const unsigned char print = 1; static const unsigned char report = 1; + static const uchar versionFlag; TextPosition n; unsigned samplerate; unsigned C[256]; TextPosition bwtEndPos; - THuffAlphabetRank *alphabetrank; - BitRank *sampled; +// THuffAlphabetRank *alphabetrank; + // RLWaveletTree *alphabetrank; + static_sequence * alphabetrank; + BSGAP *sampled; BlockArray *suffixes; + BlockArray *suffixDocId; BlockArray *positions; TCodeEntry *codetable; DynFMI *dynFMI; // Total number of texts in the collection unsigned numberOfTexts; + // Total number of texts including empty texts + unsigned numberOfAllTexts; // Length of the longest text ulong maxTextLength; - - + // Array of document id's in the order of end-markers in BWT // Access by endmarkerDocId[rank_$(L, p) - 1]. BlockArray *endmarkerDocId; @@ -226,7 +279,11 @@ private: BlockArray *textLength; // Array of text starting positions (in the inserted order) BlockArray *textStartPos; - + + // FIXME Replace with a more succinct data structure + std::set emptyTextId; + BSGAP *emptyTextRank; + // Private methods uchar * BWT(uchar *); uchar * LoadFromFile(const char *); @@ -236,12 +293,27 @@ private: // Following are not part of the public API DocId DocIdAtTextPos(TextPosition) const; - unsigned CountEndmarkers(TextPosition, TextPosition) const; ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; TextPosition Inverse(TextPosition) const; TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; TextPosition Psi(TextPosition) const; uchar * Substring(TextPosition, TextPosition) const; + TextPosition Lookup(TextPosition) const; + + /** + * Count end-markers in given interval + */ + inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const + { + if (sp > ep) + return 0; + + ulong ranksp = 0; + if (sp != 0) + ranksp = alphabetrank->rank(0, sp - 1); + + return alphabetrank->rank(0, ep) - ranksp; + } }; #endif diff --git a/GapEncode.cpp b/GapEncode.cpp new file mode 100644 index 0000000..32102ac --- /dev/null +++ b/GapEncode.cpp @@ -0,0 +1,151 @@ +/** + * Gap encoding scheme + * Niko Välimäki + * + */ + +#include "GapEncode.h" +#include +using namespace std; + + +GapEncode::GapEncode(ulong *B, ulong u, bool freeB = false) +{ + ulong i, lastIndex, *bitRun, *runLength; + this->u = u; + this->numberOfRuns = 0; + bitRun = new ulong[u/W + 1]; + for (i = 0; i < u/W + 1; i ++) + bitRun[i] = 0; + + // Collect the runs of 1- or 0-bits + totalLength = 0; + uchar curBit = 0xFF; + lastIndex = 0; + for (i = 0; i < u; i ++) + if (Tools::GetField(B, 1, i) != curBit) + { + if (curBit == 1) + totalLength += i - lastIndex; + else if (Tools::GetField(B, 1, i)) + Tools::SetField(bitRun, 1, i, 1); + + lastIndex = i; + curBit = Tools::GetField(B, 1, i); + } + // Handle the last run + if (curBit == 1) + totalLength += i - lastIndex; + + runLength = new ulong[totalLength/W + 1]; + for (i = 0; i < totalLength/W + 1; i ++) + runLength[i] = 0; + + // Calculate run lengths + totalLength = 0; + curBit = 0xFF; + lastIndex = 0; + for (i = 0; i < u; i ++) + if (Tools::GetField(B, 1, i) != curBit) + { + if (curBit == 1) + { + totalLength += i - lastIndex; + Tools::SetField(runLength, 1, totalLength - 1, 1); + } + lastIndex = i; + curBit = Tools::GetField(B, 1, i); + } + // Handle the last run + if (curBit == 1) + { + totalLength += i - lastIndex; + Tools::SetField(runLength, 1, totalLength - 1, 1); + } + + // Construct BSGAP structures + this->bitRun = new BSGAP(bitRun, u, true, 0); + numberOfRuns = this->bitRun->rank(u-1); + this->runLength = new BSGAP(runLength, totalLength, true, 0); + + if (freeB) + delete [] B; +} + +GapEncode::~GapEncode() +{ + // Free BSGAP structures + delete bitRun; + delete runLength; +} + +ulong GapEncode::rank(ulong i) +{ + ulong pred, r = bitRun->rank(i); + if (r == 0) + return 0; + pred = bitRun->select(r); + + ulong bitsBeforeRun = 0; + if (r > 1) + bitsBeforeRun = runLength->select(r - 1) + 1; + + ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun; + if (i - pred >= bitsInRun) + return bitsBeforeRun + bitsInRun; + + return bitsBeforeRun + i - pred + 1; +} + +ulong GapEncode::rank0(ulong i) +{ + return i - rank(i) + 1; +} + +ulong GapEncode::select(ulong i) +{ + throw logic_error("GapEncode::select() is not implemented"); + return 0; +} + +ulong GapEncode::select0(ulong i) +{ + throw logic_error("GapEncode::select0() is not implemented"); + return 0; // To be implemented... +} + +/** + * Saving data fields: + ulong u; // Universe size, number of bits in B + ulong numberOfRuns; // Number of 1-bit runs + BSGAP *bitRun, // Contains 1-bit at the start of each 1-bit-run + *runLength; // Contains run lengths for each 1-bit-run + ulong totalLength; // Total length of runs +*/ +void GapEncode::Save(FILE *file) const +{ + if (std::fwrite(&(this->u), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("GapEncode::Save(): file write error (u)."); + if (std::fwrite(&(this->numberOfRuns), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("GapEncode::Save(): file write error (numberOfRuns)."); + + bitRun->Save(file); + runLength->Save(file); + + if (std::fwrite(&(this->totalLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("GapEncode::Save(): file write error (totalLength)."); +} + +GapEncode::GapEncode(FILE *file) +{ + if (std::fread(&(this->u), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("GapEncode::Load(): file read error (u)."); + if (std::fread(&(this->numberOfRuns), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("GapEncode::Load(): file read error (numberOfRuns)."); + + bitRun = new BSGAP(file); + runLength = new BSGAP(file); + + if (std::fread(&(this->totalLength), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("GapEncode::Load(): file read error (totalLength)."); +} diff --git a/GapEncode.h b/GapEncode.h new file mode 100644 index 0000000..a73e595 --- /dev/null +++ b/GapEncode.h @@ -0,0 +1,113 @@ +/** + * Gap encoding scheme + * Niko Välimäki + * + * + */ + +#ifndef _GapEncode_H_ +#define _GapEncode_H_ + +#include "Tools.h" +#include "BlockArray.h" +#include "BSGAP.h" + +class GapEncode +{ +public: + GapEncode(ulong *, ulong, bool); + GapEncode(FILE *); + ~GapEncode(); + + void Save(FILE *) const; + ulong rank(ulong); + ulong rankrun(ulong i) {return bitRun->rank(i);}; + ulong selectrun(ulong i) {return bitRun->select(i);}; + ulong rank0(ulong); + ulong select(ulong); + ulong select0(ulong); // Not implemented, yet... + bool IsBitSet(ulong i) { + ulong r = bitRun->rank(i); + if (r == 0) + return false; + ulong pred = bitRun->select(r); + + ulong bitsBeforeRun = 0; + if (r > 1) + bitsBeforeRun = runLength->select(r - 1) + 1; + + ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun; + if (i - pred >= bitsInRun) + return false; + + return true; + } + + /** + * Returns also the rank_1 of given position i + */ + bool IsBitSet(ulong i, ulong *rank) { + *rank = 0; + ulong r = bitRun->rank(i); + if (r == 0) + return false; + ulong pred = bitRun->select(r); + + ulong bitsBeforeRun = 0; + if (r > 1) + bitsBeforeRun = runLength->select(r - 1) + 1; + + ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun; + if (i - pred >= bitsInRun) + { + *rank = bitsBeforeRun + bitsInRun; + return false; + } + + *rank = bitsBeforeRun + i - pred + 1; + return true; + } + + /** + * Returns also the rank_1 of given position i + * and the length of the trailing 0/1-run. + */ + bool IsBitSet(ulong i, ulong *rank, ulong *runl) { + *rank = 0; + ulong r = bitRun->rank(i); + if (r == 0) + { + // Length of 0-bit run after pos i + *runl = bitRun->select(1) - i - 1; + return false; + } + ulong pred = bitRun->select(r); + + ulong bitsBeforeRun = 0; + if (r > 1) + bitsBeforeRun = runLength->select(r - 1) + 1; + + ulong bitsInRun = runLength->select(r) + 1 - bitsBeforeRun; + if (i - pred >= bitsInRun) + { + *rank = bitsBeforeRun + bitsInRun; + *runl = u - i - 1; + if (r < numberOfRuns) + *runl = bitRun->select(r+1) - i - 1; + return false; + } + + *rank = bitsBeforeRun + i - pred + 1; + *runl = bitsInRun - (i - pred) - 1; + return true; + } + +private: + ulong u; // Universe size, number of bits in B + ulong numberOfRuns; // Number of 1-bit runs + BSGAP *bitRun, // Contains 1-bit at the start of each 1-bit-run + *runLength; // Contains run lengths for each 1-bit-run + ulong totalLength; // Total length of runs +}; + +#endif diff --git a/RLWaveletTree.cpp b/RLWaveletTree.cpp new file mode 100644 index 0000000..dd043cb --- /dev/null +++ b/RLWaveletTree.cpp @@ -0,0 +1,383 @@ +/*************************************************************************** + * Copyright (C) 2007 by Veli Mäkinen * + * * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +#include "RLWaveletTree.h" +using namespace std; + +void RLWaveletTree::remapAlphabet(uchar *text, ulong n) +{ + ulong *chars = new ulong[256/W+1]; + ulong i; + for (i = 0; i < 256/W+1; ++i) + chars[i] = 0; + // Set up flags: + for (i = 0; i < n; ++i) + Tools::SetField(chars, 1, text[i], 1); + charMap = new BitRank(chars, 256, true); + + // Remap text + for (i = 0; i < n; ++i) + text[i] = charMap->rank(text[i])-1; +} + +RLWaveletTree::RLWaveletTree(uchar *D, ulong n) { + remapAlphabet(D, n); +// std::cout << "Number of chars: " << charMap->rank(255) << ", maxlevel = " << Tools::CeilLog2(charMap->rank(255)) << std::endl; + + this->n = n; + this->maxlevel = Tools::CeilLog2(charMap->rank(255)); + + ulong i, k, value; + ulong** bucket = new ulong*[maxlevel+1]; + ulong *wtree = new ulong[n*maxlevel/(8*sizeof(ulong))+1]; + for (i = 0; i < n*maxlevel/(8*sizeof(ulong))+1; i ++) + wtree[i] = 0lu; + + // allocate memory for storing the locations of boudaries + for (k=0; k<=maxlevel; k++) { + bucket[k] = (ulong *)new ulong[(ulong)1 << k]; + for (i=0; i<((ulong)1 << k); i++) + bucket[k][i]=0; + } + + // compute the sizes of buckets + for (i=0; i> (maxlevel-k)]++; + } + + // cumulate the bucket sizes into boundary locations + for (k=0; k<=maxlevel; k++) { + for (i=1; i<((ulong)1 << k); i++) + bucket[k][i]+=bucket[k][i-1]; + for (i=((ulong)1 << k)-1; i>=1; i--) + bucket[k][i]=bucket[k][i-1]; + bucket[k][0]=0; + } + + //store the leaf information + leaves = new ulong[((ulong)1 << maxlevel)+1]; + for (k=0; k<(ulong)1<> (maxlevel-k)])++,(value>>(maxlevel-k-1)) & (ulong)1); + } + delete [] D; + // deallocate boundaries + for (k=0; k<=maxlevel; k++) + delete[] bucket[k]; + delete[] bucket; + + bitrank = new GapEncode(wtree, n*maxlevel, true); + prevRunLen = 0; + prevChar = 0; + prevPos = 0; +} + +RLWaveletTree::~RLWaveletTree() { + delete charMap; + delete [] leaves; + delete bitrank; +} + +ulong RLWaveletTree::rank(ulong c, ulong i) { +// if (i >= n) FIXME Replace with assert? +// std::cout << "RLWT::rank parameter i out of range" << std::endl; + ulong k; + ulong j=i; + ulong left = 0, right = (ulong)1 << maxlevel; + ulong before=0; + c = charMap->rank(c)-1; + + // first round separately to avoid extra "if" + if ((c >> (maxlevel-1)) & (ulong)1) { // go to the right + j = bitrank->rank(j)-1; + left = right>>1; + } + else { // go to the left + j -= bitrank->rank(j); + right = right>>1; + } + // then the next without the extra if + for (k=1;krank(n*k+leaves[left]-1); + if ((c >> (maxlevel-k-1)) & (ulong)1) { // go to the right + j = bitrank->rank(n*k+leaves[left]+j)-before-1; + left = (right+left)>>1; + } + else { // go to the left + j -= bitrank->rank(n*k+leaves[left]+j)-before; + right = (right+left)>>1; + } + // can never happen: if (left > right) return 0; + } + return j+1; +} + +ulong RLWaveletTree::select(ulong c, ulong j) { + c = charMap->rank(c)-1; + ulong left=c, right=c+1; + ulong k, i=j-1, before; + for (k=maxlevel-1; k>0;--k) { + if ((c >> (maxlevel-k-1)) & (ulong)1) { // right side of parent + left -= right-left; + before = bitrank->rank(n*k+leaves[left]-1); + i = bitrank->select(before+i+1)-n*k-leaves[left]; + } + else { // left side of parent + right +=right-left; + before = n*k+leaves[left]-bitrank->rank(n*k+leaves[left]-1); + i = bitrank->select0(before+i+1)-n*k-leaves[left]; + } + } + // last level separately to avoid one extra if + if ((c >> (maxlevel-1)) & (ulong)1) // right side of parent + return bitrank->select(i+1); + else // left side of parent + return bitrank->select0(i+1); +} + +bool RLWaveletTree::IsCharAtPos(ulong c, ulong i) { + return charAtPos(i)==c; +} + + +ulong RLWaveletTree::charAtPos(ulong i) +{ +// if (i >= n) // FIXME Replace with assert? +// std::cout << "RLWT::charAtPos parameter i out of range" << std::endl; + ulong k; + ulong j=i; + ulong left = 0, right = (ulong)1 << maxlevel; + ulong before=0; + ulong c = 0; + ulong rank_tmp = 0; + // first round separately to avoid extra "if" + if (bitrank->IsBitSet(j, &rank_tmp)) { // go to the right + c = c | ((ulong)1 << (maxlevel-1)); + j = rank_tmp-1; //bitrank->rank(j)-1; + left = right>>1; + } + else { // go to the left + // c = c + j -= rank_tmp; // bitrank->rank(j); + right = right>>1; + } + // then the next without the extra if + for (k=1;krank(n*k+leaves[left]-1); + if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right + c = c | ((ulong)1 << (maxlevel-k-1)); + j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1; + left = (right+left)>>1; + } + else { // go to the left + // c = c + j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before; + right = (right+left)>>1; + } + // can never happen: if (left > right) return 0; + } + return charMap->select(c+1); +} + +/* returns also the rank-1 of char at given position i */ +ulong RLWaveletTree::charAtPos(ulong i, ulong *rank) +{ +// if (i >= n) // FIXME Replace with assert? +// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl; + ulong k; + ulong j=i; + ulong left = 0, right = (ulong)1 << maxlevel; + ulong before=0; + ulong c = 0; + ulong rank_tmp = 0; + // first round separately to avoid extra "if" + if (bitrank->IsBitSet(j, &rank_tmp)) { // FIXME j or leaves[left]+j ? + c = c | ((ulong)1 << (maxlevel-1)); + j = rank_tmp-1; //bitrank->rank(j)-1; + left = right>>1; + } + else { // go to the left + // c = c + j -= rank_tmp; //bitrank->rank(j); + right = right>>1; + } + // then the next without the extra if + for (k=1;krank(n*k+leaves[left]-1); + if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp)) { // go to the right + c = c | ((ulong)1 << (maxlevel-k-1)); + j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1; + left = (right+left)>>1; + } + else { // go to the left + // c = c + j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before; + right = (right+left)>>1; + } + // can never happen: if (left > right) return 0; + } + *rank = j; + return charMap->select(c+1); +} + +/* returns also the rank-1 of char at given position i + * and trailing char-run length */ +ulong RLWaveletTree::charAtPosRun(ulong i, ulong *rank) +{ + // Check for buffered data from previous call + if (i > prevPos && i <= prevPos + prevRunLen) + { + *rank = prevRank + (i-prevPos); + return prevChar; + } + + prevPos = i; +// if (i >= n) // FIXME Replace with assert? +// std::cout << "RLWT::charAtPos2 parameter i out of range" << std::endl; + ulong k; + ulong j=i; + ulong left = 0, right = (ulong)1 << maxlevel; + ulong before=0; + ulong c = 0; + ulong rank_tmp = 0; + ulong runl_tmp = 0; + // first round separately to avoid extra "if" + if (bitrank->IsBitSet(j, &rank_tmp, &prevRunLen)) { + if (j+prevRunLen > n) + prevRunLen = n-j-1; + c = c | ((ulong)1 << (maxlevel-1)); + j = rank_tmp-1; //bitrank->rank(j)-1; + left = right>>1; + } + else { // go to the left + // c = c + if (j+prevRunLen > n) + prevRunLen = n-j-1; + j -= rank_tmp; //bitrank->rank(j); + right = right>>1; + } + + // then the next without the extra if + for (k=1;krank(n*k+leaves[left]-1); + if (bitrank->IsBitSet(n*k+leaves[left]+j, &rank_tmp, &runl_tmp)) { // go to the right + if (leaves[left]+j+runl_tmp > leaves[right]) + runl_tmp = leaves[right] - (leaves[left]+j) - 1; + if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run? + prevRunLen = runl_tmp; + c = c | ((ulong)1 << (maxlevel-k-1)); + j = rank_tmp-before-1; //bitrank->rank(n*k+leaves[left]+j)-before-1; + left = (right+left)>>1; + } + else { // go to the left + if (leaves[left]+j+runl_tmp > leaves[right]) + runl_tmp = leaves[right] - (leaves[left]+j) - 1; + if (prevRunLen > runl_tmp) // FIXME Better way to keep track of a run? + prevRunLen = runl_tmp; + // c = c + j -= rank_tmp-before; //bitrank->rank(n*k+leaves[left]+j)-before; + right = (right+left)>>1; + } + // can never happen: if (left > right) return 0; + } + *rank = j; + prevRank = j; + prevChar = charMap->select(c+1); + return charMap->select(c+1); +} + +/** + * Saving data fields: + ulong n; + ulong maxlevel; + ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1 + GapEncode *bitrank; + BitRank *charMap; +*/ +void RLWaveletTree::Save(FILE *file) const +{ + if (std::fwrite(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Save(): file write error (n)."); + if (std::fwrite(&(this->maxlevel), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Save(): file write error (maxlevel)."); + + for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset) + { + if (std::fwrite(this->leaves + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Save(): file write error (leaves)."); + } + + bitrank->Save(file); + + // Silly + ulong *chars = new ulong[256/W+1]; + // Set up flags: + for (ulong i = 0; i < 256; ++i) + Tools::SetField(chars, 1, i, charMap->IsBitSet(i)); + + for (ulong offset = 0; offset < 256/W+1; ++offset) + { + if (std::fwrite(chars + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Save(): file write error (chars)."); + } + + delete [] chars; +} + +/** + * Load from file + */ +RLWaveletTree::RLWaveletTree(FILE *file) +{ + if (std::fread(&(this->n), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Load(): file read error (n)."); + if (std::fread(&(this->maxlevel), sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Load(): file read error (maxlevel)."); + + for (ulong offset = 0; offset < ((ulong)1 << maxlevel)+1; ++offset) + { + if (std::fread(this->leaves + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Load(): file read error (leaves)."); + } + + bitrank = new GapEncode(file); + + // Silly + ulong *chars = new ulong[256/W+1]; + // Read flags + for (ulong offset = 0; offset < 256/W+1; ++offset) + { + if (std::fread(chars + offset, sizeof(ulong), 1, file) != 1) + throw std::runtime_error("RLWaveletTree::Load(): file read error (chars)."); + } + charMap = new BitRank(chars, 256, true); + + prevRunLen = 0; // Init "buffer" values + prevChar = 0; + prevPos = 0; +} diff --git a/RLWaveletTree.h b/RLWaveletTree.h new file mode 100644 index 0000000..f3ed9ed --- /dev/null +++ b/RLWaveletTree.h @@ -0,0 +1,64 @@ +/*************************************************************************** + * Copyright (C) 2007 by Veli Mäkinen * + * * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + ***************************************************************************/ + +/* + * Wavelet tree by Veli Mäkinen, + */ + +#ifndef _RLWAVELETTREE_H_ +#define _RLWAVELETTREE_H_ +#include "GapEncode.h" +#include "Tools.h" +#include "BitRank.h" + +class RLWaveletTree { +private: + ulong n; + ulong maxlevel; + ulong *leaves; // stores the leaf positions of characters 0..2^maxlevel-1 + GapEncode *bitrank; + BitRank *charMap; + void remapAlphabet(uchar*, ulong); + + ulong prevRunLen; // Used by charAtPosRun() + ulong prevPos; + ulong prevChar; + ulong prevRank; +public: + RLWaveletTree(uchar *, ulong); + RLWaveletTree(FILE *); + ~RLWaveletTree(); + + void Save(FILE *) const; + ulong rank(ulong c, ulong i); + + ulong select(ulong c, ulong j); + + bool IsCharAtPos(ulong c, ulong i); + + ulong charAtPos(ulong i); + ulong charAtPos(ulong i, ulong *rank); + ulong charAtPosRun(ulong i, ulong *rank); + double spaceInBits() { + return 0; // Not implemented + } +}; + +#endif diff --git a/TextCollection.h b/TextCollection.h index 6a4eb1c..ef0eb72 100644 --- a/TextCollection.h +++ b/TextCollection.h @@ -105,7 +105,7 @@ namespace SXSI * * Note: Parameters i and j are text positions inside the k'th text. */ - virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0; +// virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0; /** * Returns backwards (reverse) iterator to the end of i'th text * @@ -164,6 +164,12 @@ namespace SXSI typedef std::vector > full_result; virtual full_result FullContains(uchar const *) const = 0; + + /** + *Debug + * + */ + virtual TextPosition Lookup(TextPosition) const = 0; protected: // Protected constructor; call the static function InitTextCollection(). TextCollection() { }; diff --git a/Tools.h b/Tools.h index 45647e8..4def5e8 100644 --- a/Tools.h +++ b/Tools.h @@ -25,8 +25,13 @@ # define W 32 #endif +#ifndef WW #define WW (W*2) +#endif + +#ifndef Wminusone #define Wminusone (W-1) +#endif #ifndef uchar #define uchar unsigned char diff --git a/makefile b/makefile index 737484a..36094fe 100644 --- a/makefile +++ b/makefile @@ -1,7 +1,9 @@ CC = g++ -CPPFLAGS = -Wall -ansi -pedantic -g -O2 -DNDEBUG +LIBCDSPATH = ../libcds +CPPFLAGS = -Wall -ansi -g -O2 -I$(LIBCDSPATH)/includes/ -DNDEBUG +LIBCDSA = $(LIBCDSPATH)/lib/libcds.a -testTextCollection_obs = testTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o +testTextCollection_obs = testTextCollection5.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o rbtree.o dynFMI.o RLWaveletTree.o GapEncode.o BSGAP.o ${LIBCDSA} all: $(testTextCollection_obs) -- 2.17.1