From 7d27a4450ed429e3b63e9d3ba7217a28cbbf9a31 Mon Sep 17 00:00:00 2001 From: kim Date: Mon, 24 Nov 2008 23:32:08 +0000 Subject: [PATCH 1/1] Initial Import git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@6 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- BitRank.cpp | 300 +++++++++++++ BitRank.h | 56 +++ CSA.cpp | 976 ++++++++++++++++++++++++++++++++++++++++ CSA.h | 231 ++++++++++ TextCollection.cpp | 16 + TextCollection.h | 168 +++++++ Tools.cpp | 95 ++++ Tools.h | 110 +++++ bittree.cpp | 989 +++++++++++++++++++++++++++++++++++++++++ bittree.h | 227 ++++++++++ dependencies.mk | 12 + dynFMI.cpp | 807 +++++++++++++++++++++++++++++++++ dynFMI.h | 233 ++++++++++ handle.cpp | 288 ++++++++++++ handle.h | 134 ++++++ makefile | 15 + pos.cpp | 170 +++++++ pos.h | 139 ++++++ rbtree.cpp | 454 +++++++++++++++++++ rbtree.h | 116 +++++ testTextCollection.cpp | 69 +++ 21 files changed, 5605 insertions(+) create mode 100644 BitRank.cpp create mode 100644 BitRank.h create mode 100644 CSA.cpp create mode 100644 CSA.h create mode 100644 TextCollection.cpp create mode 100644 TextCollection.h create mode 100644 Tools.cpp create mode 100644 Tools.h create mode 100644 bittree.cpp create mode 100644 bittree.h create mode 100644 dependencies.mk create mode 100644 dynFMI.cpp create mode 100644 dynFMI.h create mode 100644 handle.cpp create mode 100644 handle.h create mode 100644 makefile create mode 100644 pos.cpp create mode 100644 pos.h create mode 100644 rbtree.cpp create mode 100644 rbtree.h create mode 100644 testTextCollection.cpp diff --git a/BitRank.cpp b/BitRank.cpp new file mode 100644 index 0000000..61dcc42 --- /dev/null +++ b/BitRank.cpp @@ -0,0 +1,300 @@ +#include "BitRank.h" + +/***************************************************************************** + * Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved. * + * * + * New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations. * + * * + * This library is free software; you can redistribute it and/or * + * modify it under the terms of the GNU Lesser General Public * + * License as published by the Free Software Foundation; either * + * version 2.1 of the License, or (at your option) any later version. * + * * + * This library is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * + * Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public * + * License along with this library; if not, write to the Free Software * + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * + ****************************************************************************/ + +// Modified by Niko Välimäki + +///////////// +//Rank(B,i)// +///////////// +//This Class use a superblock size of 256-512 bits +//and a block size of 32-64 bits also + + +const unsigned char __popcount_tab[] = +{ +0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8, +}; + +const unsigned char select_tab[] = +{ +0,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +8,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, +}; + + +// bits needed to represent a number between 0 and n +inline ulong bits (ulong n){ + ulong b = 0; + while (n) { b++; n >>= 1; } + return b; +} + +#if W == 32 + // 32 bit version + inline unsigned popcount (register ulong x){ + return __popcount_tab[(x >> 0) & 0xff] + __popcount_tab[(x >> 8) & 0xff] + __popcount_tab[(x >> 16) & 0xff] + __popcount_tab[(x >> 24) & 0xff]; + } +#else + // 64 bit version + inline unsigned popcount (register ulong x){ + return __popcount_tab[(x >> 0) & 0xff] + __popcount_tab[(x >> 8) & 0xff] + __popcount_tab[(x >> 16) & 0xff] + __popcount_tab[(x >> 24) & 0xff] + __popcount_tab[(x >> 32) & 0xff] + __popcount_tab[(x >> 40) & 0xff] + __popcount_tab[(x >> 48) & 0xff] + __popcount_tab[(x >> 56) & 0xff]; + } +#endif + +inline unsigned popcount16 (register int x){ + return __popcount_tab[x & 0xff] + __popcount_tab[(x >> 8) & 0xff]; +} + +inline unsigned popcount8 (register int x){ + return __popcount_tab[x & 0xff]; +} + +BitRank::BitRank(ulong *bitarray, ulong n, bool owner) { + data=bitarray; + this->owner = owner; + this->n=n; // length of bitarray in bits + b = W; // b is a word + s=b*superFactor; + ulong aux=(n+1)%W; + if (aux != 0) + integers = (n+1)/W+1; + else + integers = (n+1)/W; + BuildRank(); +} + +BitRank::~BitRank() { + delete [] Rs; + delete [] Rb; + if (owner) delete [] data; +} + +//Build the rank (blocks and superblocks) +void BitRank::BuildRank() +{ + ulong num_sblock = n/s; + ulong num_block = n/b; + Rs = new ulong[num_sblock+1];//+1 we add the 0 pos + Rb = new uchar[num_block+1];//+1 we add the 0 pos + + ulong j; + Rs[0] = 0lu; + + for (j=1;j<=num_sblock;j++) + { + Rs[j]=BuildRankSub((j-1)*superFactor,superFactor)+Rs[j-1]; + } + + Rb[0]=0; + for (ulong k=1;k<=num_block;k++) { + j = k / superFactor; + Rb[k]=BuildRankSub(j*superFactor, k%superFactor); + } +} + +ulong BitRank::BuildRankSub(ulong ini, ulong bloques){ + ulong rank=0,aux; + + + for(ulong i=ini;i>8]+Rb[i>>wordShift] + +popcount(data[i >> wordShift] & ((1lu << (i & Wminusone))-1)); +} + +ulong BitRank::select(ulong x) { + // returns i such that x=rank(i) && rank(i-1) integers) + return n; + + j = data[left]; + + ones = popcount(j); + } + //sequential search using popcount over a char + left=left*b; + rankmid = popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + } + } + } + + // then sequential search bit a bit + while (x>0) { + if (j&1lu) x--; + j=j>>1; + left++; + } + return left-1; +} + +ulong BitRank::select0(ulong x) { + // returns i such that x=rank0(i) && rank0(i-1) integers) + return n; + + j = data[left]; + zeros = W - popcount(j); + } + + //sequential search using popcount over a char + left=left*b; + rankmid = 8 - popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = 8 - popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = 8 - popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + } + } + } + + // then sequential search bit a bit + while (x>0) { + if (!(j&1lu)) x--; + j=j>>1; + left++; + } + return left - 1; +} + + +bool BitRank::IsBitSet(ulong i) { + return (1lu << (i % W)) & data[i/W]; +} + +ulong BitRank::NumberOfBits() { + return n; +} diff --git a/BitRank.h b/BitRank.h new file mode 100644 index 0000000..c7783d2 --- /dev/null +++ b/BitRank.h @@ -0,0 +1,56 @@ +/***************************************************************************** + * Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved. * + * * + * New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations. * + * * + * This library is free software; you can redistribute it and/or * + * modify it under the terms of the GNU Lesser General Public * + * License as published by the Free Software Foundation; either * + * version 2.1 of the License, or (at your option) any later version. * + * * + * This library is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * + * Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public * + * License along with this library; if not, write to the Free Software * + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * + ****************************************************************************/ + +#ifndef _BITSTREAM_H_ +#define _BITSTREAM_H_ + +#include "Tools.h" + +class BitRank { +private: + // Check word length +#if W == 32 + static const unsigned wordShift = 5; + static const unsigned superFactor = 8; // 256 bit blocks +#else + static const unsigned wordShift = 6; + static const unsigned superFactor = 4; // 256 bit blocks +#endif + + ulong *data; //here is the bit-array + bool owner; + ulong n,integers; + unsigned b,s; + ulong *Rs; //superblock array + uchar *Rb; //block array + ulong BuildRankSub(ulong, ulong); //internal use of BuildRank + void BuildRank(); //crea indice para rank +public: + BitRank(ulong *, ulong, bool); + ~BitRank(); //destructor + ulong rank(ulong i); //Rank from 0 to n-1 + ulong select(ulong x); // gives the position of the x:th 1. + ulong select0(ulong x); // gives the position of the x:th 0. + + bool IsBitSet(ulong i); + ulong NumberOfBits(); +}; + +#endif diff --git a/CSA.cpp b/CSA.cpp new file mode 100644 index 0000000..a15c733 --- /dev/null +++ b/CSA.cpp @@ -0,0 +1,976 @@ +/****************************************************************************** + * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki * + * * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + *****************************************************************************/ +#include "CSA.h" +#include "TextCollection.h" +#include +#include +#include +#include +#include +#include +#include +#include // For strlen() +using std::vector; +using std::pair; +using std::make_pair; +using std::map; + +using SXSI::TextCollection; + + +//////////////////////////////////////////////////////////////////////////// +// Class CSA::THuffAlphabetRank + +CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) { + left = NULL; + right = NULL; + bitrank = NULL; + ch = s[0]; + leaf = false; + this->codetable = codetable; + + bool *B = new bool[n]; + TextPosition sum=0,i; + /* + for (i=0; i< n; i++) { + printf("%c:", (char)((int)s[i]-128)); + for (r=0;r 100) return;*/ + for (i=0;i 0) + sfirst = new uchar[n-sum]; + //if (sum > 0) + ssecond = new uchar[sum]; + unsigned j=0,k=0; + for (i=0;i 0) { + left = new THuffAlphabetRank(sfirst,j,codetable,level+1); + delete [] sfirst; + //} + //if (k>0) { + right = new THuffAlphabetRank(ssecond,k,codetable,level+1); + delete [] ssecond; + //} +} + + +bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) { + // testing that the code works correctly + int C[256]; + unsigned i,j; + bool correct=true; + for (j=0;j<256;j++) + C[j] = 0; + for (i=0;i%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i)); + } + } + return correct; +} + +CSA::THuffAlphabetRank::~THuffAlphabetRank() { + if (left!=NULL) delete left; + if (right!=NULL) delete right; + if (bitrank!=NULL) + delete bitrank; +} + + +//////////////////////////////////////////////////////////////////////////// +// Class CSA + +/** + * Constructor inits an empty dynamic FM-index. + * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. + */ +CSA::CSA(unsigned samplerate) + : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0), + codetable(0), dynFMI(0) +{ + this->samplerate = samplerate; + + // FIXME TODO : DynFMI needs distribution of characters before hand + // This will create fully balanced wavelet tree for all chars [0, 255]. + uchar temp[256]; + for (unsigned i = 0; i < 255; ++i) + temp[i] = i+1; + temp[255] = 0; + dynFMI = new DynFMI(temp, 1, 255, false); +} + +/** + * Insert new text + * + * Given text must include end-marker. + * Text identifiers are numbered starting from 0. + */ +void CSA::InsertText(uchar const * text) +{ + // Sanity check: + assert(dynFMI != 0); + + TextPosition m = std::strlen((char *)text) + 1; + // Store text length and starting position + textLength.push_back(make_pair(m, n)); + this->n += m; + dynFMI->addText(text, m); +} + +void CSA::MakeStatic() +{ + // Sanity check: + if (dynFMI == 0) + { + std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl; + std::exit(0); + } + + uchar *bwt = dynFMI->getBWT(); + /*printf("1234567890123456789\n"); + for (TextPosition i = 0; i < n; i ++) + if (bwt[i] < 50) + printf("%d", (int)bwt[i]); + else + printf("%c", bwt[i]); + printf("\n"); + */ + +/* for (TextPosition i = 1; i <= n; i ++) + { + printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i)); + }*/ + + // Sanity check + assert(textLength.size() == dynFMI->getCollectionSize()); + + delete dynFMI; + dynFMI = 0; + + ulong i, min = 0, + max; + for (i=0;i<256;i++) + C[i]=0; + for (i=0;i0) {min = i; break;} + for (i=255;i>=min;--i) + if (C[i]>0) {max = i; break;} + + // Print frequencies +/* for(i = 0; i < 256; i++) + if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]); + fflush(stdout);*/ + + ulong prev=C[0], temp; + C[0]=0; + for (i=1;i<256;i++) { + temp = C[i]; + C[i]=C[i-1]+prev; + prev = temp; + } + this->codetable = node::makecodetable(bwt,n); + alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); + //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n"); + +/* for (i = 0; i < n; ++i) + { + uchar c = alphabetrank->charAtPos(i); + TextPosition j = C[c]+alphabetrank->rank(c, i)-1; + printf("LF[%lu] = %lu\n", i, j); + }*/ + + // Calculate BWT end-marker position (of last inserted text) + i = 0; + while (bwt[i] != '\0') + { + uchar c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c, i)-1; + } + bwtEndPos = i; + //printf("bwtEndPos = %lu\n", bwtEndPos); + + delete [] bwt; + + // Make sampling tables + maketables(); + // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small + // |class CSA| +256*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+... + //printf("FMindex takes %d B\n", + // 6*W/8+256*3*W/8+256*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8); +} + + +uchar* CSA::GetText(DocId k) const +{ + assert(k < (DocId)textLength.size()); + + uchar* result = new uchar[textLength[k].first]; + TextPosition i = k; + TextPosition j = textLength[k].first-1; + result[j] = '\0'; + + uchar c = alphabetrank->charAtPos(i); + while (c != '\0') + { + --j; + result[j] = c; + i = C[c]+alphabetrank->rank(c,i)-1; + + c = alphabetrank->charAtPos(i); // "next" char. + } + assert(j == 0); + return result; +} + +uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const +{ + assert(k < (DocId)textLength.size()); + assert(j < textLength[k].first); + assert(i <= j); + + // Start position of k'th text + ulong start = textLength[k].second; + + return Substring(i + start, j-i+1); +} + +/** + * Existential queries + */ +bool CSA::IsPrefix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Check for end-marker(s) in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + if (it != endmarkers.end() && it->first <= ep) + return true; + return false; +} + +bool CSA::IsSuffix(uchar const *pattern) const +{ + // Here counting is as fast as isSuffix(): + if (CountSuffix(pattern) > 0) + return true; + return false; +} + +bool CSA::IsEqual(uchar const *pattern) const +{ + TextPosition m = std::strlen((char *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep); + + // Check for end-marker(s) in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + if (it != endmarkers.end() && it->first <= ep) + return true; + return false; +} + +bool CSA::IsContains(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return true; + + TextPosition sp = 0, ep = 0; + // Just check if pattern exists somewhere + ulong count = Search(pattern, m, &sp, &ep); + + if (count > 0) + return true; + return false; +} + +bool CSA::IsLessThan(uchar const*) const +{ + // TODO + return false; +} + +/** + * Counting queries + */ +unsigned CSA::CountPrefix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Count end-markers in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + unsigned count = 0; + while (it != endmarkers.end() && it->first <= ep) + { + ++ count; + ++ it; + } + + return count; +} + +unsigned CSA::CountSuffix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + // Search with end-marker + unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep); + + return count; +} + +unsigned CSA::CountEqual(uchar const *pattern) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep); + + // Count end-markers in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + unsigned count = 0; + while (it != endmarkers.end() && it->first <= ep) + { + ++ count; + ++ it; + } + + return count; +} + +unsigned CSA::CountContains(uchar const * pattern) const +{ + // Here counting is as slow as fetching the result set + // because we would have to filter out occ's that fall within same document. + TextCollection::document_result result = Contains(pattern); + return result.size(); +} + +unsigned CSA::CountLessThan(uchar const * pattern) const +{ + // TODO + return 0; +} + +/** + * Document reporting queries + */ +TextCollection::document_result CSA::Prefix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + unsigned count = 0; + while (it != endmarkers.end() && it->first <= ep) + { + // End-marker we found belongs to the "previous" doc in the collection + DocId docId = ((it->second).first + 1) % textLength.size(); + result.push_back(docId); + ++ count; + ++ it; + } + + return result; +} + +TextCollection::document_result CSA::Suffix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Search with end-marker + Search(pattern, m+1, &sp, &ep); + + TextCollection::document_result result; + + // Check each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->charAtPos(i); + while (c != '\0' && !sampled->IsBitSet(i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + c = alphabetrank->charAtPos(i); + } + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(i))->second; + // End-marker that we found belongs to the "previous" doc in collection: + DocId docId = (endm.first + 1) % textLength.size(); + result.push_back(docId); + } + else + { + TextPosition textPos = suffixes[sampled->rank(i)-1]; + result.push_back(DocIdAtTextPos(textPos)); + } + } + + return result; +} + +TextCollection::document_result CSA::Equal(uchar const *pattern) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + unsigned count = 0; + while (it != endmarkers.end() && it->first <= ep) + { + // End-marker that we found belongs to the "previous" doc in collection: + DocId docId = ((it->second).first + 1) % textLength.size(); + result.push_back(docId); + ++ count; + ++ it; + } + + return result; +} + + +TextCollection::document_result CSA::Contains(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + // We want unique document indentifiers, using std::set to collect them + std::set resultSet; + + // Check each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->charAtPos(i); + while (c != '\0' && !sampled->IsBitSet(i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + c = alphabetrank->charAtPos(i); + } + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(i))->second; + // End-marker we found belongs to the "previous" doc in collection + DocId docId = (endm.first + 1) % textLength.size(); + resultSet.insert(docId); + } + else + { + TextPosition textPos = suffixes[sampled->rank(i)-1]; + resultSet.insert(DocIdAtTextPos(textPos)); + } + } + + // Convert std::set to std::vector (TODO better way to construct result vector?) + TextCollection::document_result result; + for (std::set::iterator it = resultSet.begin(); it != resultSet.end(); ++it) + result.push_back(*it); + return result; +} + +TextCollection::document_result CSA::LessThan(uchar const * pattern) const +{ + // TODO + return document_result(); +} + +/** + * Full result set queries + */ +TextCollection::full_result CSA::FullContains(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return full_result(); + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + full_result result; + + // Report each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->charAtPos(i); + while (c != '\0' && !sampled->IsBitSet(i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + c = alphabetrank->charAtPos(i); + ++ dist; + } + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(i))->second; + // End-marker we found belongs to the "previous" doc in the collection: + DocId docId = (endm.first+1)%textLength.size(); + result.push_back(make_pair(docId, dist)); + } + else + { + TextPosition textPos = suffixes[sampled->rank(i)-1] + dist; + // Read document identifier and its starting position in text order + DocId docId = DocIdAtTextPos(textPos); + result.push_back(make_pair(docId, textPos - textLength[docId].second)); + } + } + + return result; +} + + +/** + * Finds document identifier for given text position + * + * Starting text position of the document is stored into second parameter. + * Binary searching on text starting positions. + */ +TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const +{ + assert(i < n); + + DocId a = 0; + DocId b = textLength.size() - 1; + while (a < b) + { + DocId c = a + (b - a)/2; + if (textLength[c].second > i) + b = c - 1; + else if (textLength[c+1].second > i) + return c; + else + a = c + 1; + } + + assert(a < (DocId)textLength.size()); + assert(i > textLength[a].second); + assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second)); + return a; +} + +void CSA::Load(FILE *filename, unsigned samplerate) +{ + // TODO + +} + +void CSA::Save(FILE *filename) +{ + // TODO +} + + +/** + * Rest of the functions follow... + */ + +uchar * CSA::Substring(TextPosition i, TextPosition l) const +{ + uchar *result = new uchar[l + 1]; + if (l == 0) + { + result[0] = 0u; + return result; + } + + TextPosition dist; + TextPosition k = i + l - 1; + // Check for end of the string + if (k > n - 1) + { + l -= k - n + 1; + k = n - 1; + } + + TextPosition skip = samplerate - k % samplerate - 1; + TextPosition j; + if (k / samplerate + 1 >= n / samplerate) + { + j = bwtEndPos; + skip = n - k - 1; + } + else + { + j = positions[k/samplerate+1]; + //cout << samplerate << ' ' << j << '\n'; + } + + for (dist = 0; dist < skip + l; dist++) + { + int c = alphabetrank->charAtPos(j); + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(j))->second; + j = endm.first; // LF-mapping for end-marker + } + else + j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping + if (dist >= skip) + result[l + skip - dist - 1] = c; + } + result[l] = 0u; + return result; +} + + /*TextPosition CSA::inverse(TextPosition i) +{ + TextPosition skip = samplerate - i % samplerate; + TextPosition j; + if (i / samplerate + 1 >= n / samplerate) + { + j = bwtEndPos; + skip = n - i; + } + else + { + j = positions[i/samplerate+1]; + //cout << samplerate << ' ' << j << '\n'; + } + + while (skip > 0) + { + int c = alphabetrank->charAtPos(j); + j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping + skip --; + } + return j; + }*/ + +ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { + // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) + int c = (int)pattern[m-1]; + TextPosition i=m-1; + TextPosition sp = C[c]; + TextPosition ep = C[c+1]-1; + while (sp<=ep && i>=1) + { +// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + c = (int)pattern[--i]; + sp = C[c]+alphabetrank->rank(c,sp-1); + ep = C[c]+alphabetrank->rank(c,ep)-1; + } + *spResult = sp; + *epResult = ep; + if (sp<=ep) + return ep - sp + 1; + else + return 0; + } + + /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep) +{ + sp = C[(int)c]+alphabetrank->rank(c,sp-1); + ep = C[(int)c]+alphabetrank->rank(c,ep)-1; + + if (sp<=ep) + return ep - sp + 1; + else + return 0; + }*/ + +CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma) +{ + TextPosition dist=0; + while (!sampled->IsBitSet(i)) + { + int c = alphabetrank->charAtPos(i); + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(i))->second; + return endm.second + dist; // returns text position. + } + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + ++dist; + } + + return suffixes[sampled->rank(i)-1]+dist; +} + +CSA::~CSA() { + delete alphabetrank; + delete sampled; + delete [] suffixes; + delete [] positions; + delete [] codetable; +} + + +void CSA::maketables() +{ + ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1; + + ulong *sampledpositions = new ulong[n/W+1]; + suffixes = new ulong[sampleLength]; + positions = new ulong[sampleLength]; + + ulong i,j=0; + for (i=0;iGetPos(i) + x=(i==n-1)?0:i+1; + + if (x % samplerate == 0) { + Tools::SetField(sampledpositions,1,p,1); + positions[x/samplerate] = p; + } + + uchar c = alphabetrank->charAtPos(p); + if (c == '\0') + { + // Record the order of end-markers in BWT: + --textId; + endmarkers[p] = make_pair(textId, (TextPosition)x); + // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). + p = textId; // Correct LF-mapping to the last char of the previous text. + } + else + p = C[c]+alphabetrank->rank(c, p)-1; + } + assert(textId == 0); + /*for (map >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it) + { + printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second); + }*/ + + sampled = new BitRank(sampledpositions,n,true); + + //suffixes: + for(i=0; irank(positions[i]); + if (j==0) j=sampleLength; + suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate; + } +} + +uchar * CSA::LoadFromFile(const char *filename) +{ + uchar *s; + std::ifstream file (filename, std::ios::in|std::ios::binary); + if (file.is_open()) + { + std::cerr << "Loading CSA from file: " << filename << std::endl; + file.read((char *)&bwtEndPos, sizeof(TextPosition)); + s = new uchar[n]; + for (ulong offset = 0; offset < n; offset ++) + file.read((char *)(s + offset), sizeof(char)); + file.close(); + } + else + { + std::cerr << "Unable to open file " << filename << std::endl; + exit(1); + } + return s; +} + +void CSA::SaveToFile(const char *filename, uchar *bwt) +{ + std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc); + if (file.is_open()) + { + std::cerr << "Writing CSA to file: " << filename << std::endl; + file.write((char *)&bwtEndPos, sizeof(TextPosition)); + std::cerr << "Writing BWT of " << n << " bytes." << std::endl; + for (ulong offset = 0; offset < n; offset ++) + file.write((char *)(bwt + offset), sizeof(char)); + file.close(); + } + else + { + std::cerr << "Unable to open file " << filename << std::endl; + exit(1); + } +} + +/*uchar * CSA::BWT(uchar *text) +{ + uchar *s; + + DynFMI *wt = new DynFMI((uchar *) text, n); + s = wt->getBWT(); + for (ulong i=0;i, std::greater > q; +// +// First I push all the leaf nodes into the queue +// + for ( unsigned int i = 0 ; i < 256 ; i++ ) + if ( result[ i ].count ) + q.push(node( i, result[ i ].count ) ); +// +// This loop removes the two smallest nodes from the +// queue. It creates a new internal node that has +// those two nodes as children. The new internal node +// is then inserted into the priority queue. When there +// is only one node in the priority queue, the tree +// is complete. +// + + while ( q.size() > 1 ) { + node *child0 = new node( q.top() ); + q.pop(); + node *child1 = new node( q.top() ); + q.pop(); + q.push( node( child0, child1 ) ); + } +// +// Now I compute and return the codetable +// + q.top().maketable(0u,0u, result); + q.pop(); + return result; +} + + +void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const +{ + if ( child0 ) + { + child0->maketable( SetBit(code,bits,0), bits+1, codetable ); + child1->maketable( SetBit(code,bits,1), bits+1, codetable ); + delete child0; + delete child1; + } + else + { + codetable[value].code = code; + codetable[value].bits = bits; + } +} + +void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts ) +{ + ulong i; + for (i = 0 ; i < 256 ; i++ ) + counts[ i ].count = 0; + for (i=0; i +#include +#include "BitRank.h" +#include "dynFMI.h" +#include "TextCollection.h" + +/** + * Implementation of the TextCollection interface + * + * Implementation notes: + * + * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree + * which requires O(n log \sigma) bits of memory. If we know the distribution + * of characters beforehand, space can easily be made O(nH_0). + * + * The method MakeStatic() constructs a Succinct Suffix Array with a huffman + * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME) + * + * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used + * with any FM-index variant supporting LF-mapping. + */ +class CSA : public SXSI::TextCollection { +public: + /** + * Constructor with (default) samplerate + */ + explicit CSA(unsigned); + ~CSA(); + /** + * Following functions implement the interface described in TextCollection.h. + * For details/documentation, see TextCollection.h. + */ + void InsertText(uchar const *); + void MakeStatic(); + uchar* GetText(DocId) const; + uchar* GetText(DocId, TextPosition, TextPosition) const; + + bool IsPrefix(uchar const *) const; + bool IsSuffix(uchar const *) const; + bool IsEqual(uchar const *) const; + bool IsContains(uchar const *) const; + bool IsLessThan(uchar const *) const; + + unsigned CountPrefix(uchar const *) const; + unsigned CountSuffix(uchar const *) const; + unsigned CountEqual(uchar const *) const; + unsigned CountContains(uchar const *) const; + unsigned CountLessThan(const unsigned char*) const; + + // document_result is inherited from SXSI::TextCollection. + document_result Prefix(uchar const *) const; + document_result Suffix(uchar const *) const; + document_result Equal(uchar const *) const; + document_result Contains(uchar const *) const; + document_result LessThan(uchar const *) const; + + // full_result is inherited from SXSI::TextCollection. + full_result FullContains(uchar const *) const; + + // TODO implement: + void Load(FILE *, unsigned); + void Save(FILE *); + +private: + class TCodeEntry { + public: + unsigned count; + unsigned bits; + unsigned code; + TCodeEntry() {count=0;bits=0;code=0u;}; + }; + + + class THuffAlphabetRank { + // using fixed 0...255 alphabet + private: + BitRank *bitrank; + THuffAlphabetRank *left; + THuffAlphabetRank *right; + TCodeEntry *codetable; + uchar ch; + bool leaf; + public: + THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned); + ~THuffAlphabetRank(); + bool Test(uchar *, TextPosition); + + inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i + THuffAlphabetRank const * temp=this; + if (codetable[c].count == 0) return 0; + unsigned level = 0; + unsigned code = codetable[c].code; + while (!temp->leaf) { + if ((code & (1u<bitrank->rank(i); + temp = temp->left; + } + else { + i = temp->bitrank->rank(i)-1; + temp = temp->right; + } + ++level; + } + return i+1; + }; + inline bool IsCharAtPos(int c, TextPosition i) const { + THuffAlphabetRank const * temp=this; + if (codetable[c].count == 0) return false; + unsigned level = 0; + unsigned code = codetable[c].code; + while (!temp->leaf) { + if ((code & (1u<bitrank->IsBitSet(i)) return false; + i = i-temp->bitrank->rank(i); + temp = temp->left; + } + else { + if (!temp->bitrank->IsBitSet(i)) return false; + i = temp->bitrank->rank(i)-1; + temp = temp->right; + } + ++level; + } + return true; + } + inline int charAtPos(TextPosition i) const { + THuffAlphabetRank const * temp=this; + while (!temp->leaf) { + if (temp->bitrank->IsBitSet(i)) { + i = temp->bitrank->rank(i)-1; + temp = temp->right; + } + else { + i = i-temp->bitrank->rank(i); + temp = temp->left; + } + } + return (int)temp->ch; + } + }; + + class node { + private: + unsigned weight; + uchar value; + node *child0; + node *child1; + + void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const; + static void count_chars(uchar *, TextPosition , TCodeEntry *); + static unsigned SetBit(unsigned , unsigned , unsigned ); + public: + node( unsigned char c = 0, unsigned i = 0 ) { + value = c; + weight = i; + child0 = 0; + child1 = 0; + } + + node( node* c0, node *c1 ) { + value = 0; + weight = c0->weight + c1->weight; + child0 = c0; + child1 = c1; + } + + + bool operator>( const node &a ) const { + return weight > a.weight; + } + + static TCodeEntry *makecodetable(uchar *, TextPosition); + }; + + static const unsigned char print = 1; + static const unsigned char report = 1; + TextPosition n; + unsigned samplerate; + ulong C[256]; + TextPosition bwtEndPos; + THuffAlphabetRank *alphabetrank; + BitRank *sampled; + ulong *suffixes; + ulong *positions; + TCodeEntry *codetable; + DynFMI *dynFMI; + // Map from end-marker in BWT to pair (textId, sampled text position) + std::map > endmarkers; + // Vector of pairs of + std::vector > textLength; + + // Private methods + uchar * BWT(uchar *); + uchar * LoadFromFile(const char *); + void SaveToFile(const char *, uchar *); + void maketables(); + + // Following are not part of the public API + DocId DocIdAtTextPos(TextPosition) const; + ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; + TextPosition Lookup(TextPosition) const; + TextPosition Inverse(TextPosition) const; + TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; + TextPosition Psi(TextPosition) const; + uchar * Substring(TextPosition, TextPosition) const; +}; + +#endif diff --git a/TextCollection.cpp b/TextCollection.cpp new file mode 100644 index 0000000..0300899 --- /dev/null +++ b/TextCollection.cpp @@ -0,0 +1,16 @@ +#include "TextCollection.h" +#include "CSA.h" + +namespace SXSI +{ + /** + * Init text collection + * + * See CSA.h for more details. + */ + TextCollection * TextCollection::InitTextCollection(unsigned samplerate) + { + TextCollection *result = new CSA(samplerate); + return result; + } +} diff --git a/TextCollection.h b/TextCollection.h new file mode 100644 index 0000000..e7e36e2 --- /dev/null +++ b/TextCollection.h @@ -0,0 +1,168 @@ +/****************************************************************************** + * Copyright (C) 2008 by Niko Valimaki * + * Text collection interface for an in-memory XQuery/XPath engine * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#ifndef _SXSI_TextCollection_h_ +#define _SXSI_TextCollection_h_ + +#include "Tools.h" // Defines ulong and uchar. +#include +#include // Defines std::pair. + +// Default samplerate for suffix array samples +#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64 + +namespace SXSI +{ + /** + * General interface for a text collection + * + * Class is virtual, make objects by calling + * the static method InitTextCollection(). + */ + class TextCollection + { + public: + // Type of document identifier + typedef int DocId; + // Type for text position (FIXME ulong or long?) + typedef ulong TextPosition; + + /** + * Init an instance of a text collection object + * + * Returns a pointer to an object implementing this interface. + */ + static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE); + /** + * Load from a file + * + * New samplerate can be given, otherwise will use the one specified in the save file! + * Note: This is not a static method; call InitTextCollection() first to get the object handle. + * + * Throws an exception if something goes wrong (unlikely since we are passing a file descriptor). + * + */ + virtual void Load(FILE *, unsigned samplerate = 0) = 0; + /** + * Save data structure into a file + */ + virtual void Save(FILE *) = 0; + /** + * Virtual destructor + */ + virtual ~TextCollection() { }; + /** + * Insert text + * + * Must be a zero-terminated string from alphabet [1,255]. + * Can not be called after makeStatic(). + * The i'th text insertion gets an identifier value i-1. + * In other words, document identifiers start from 0. + */ + virtual void InsertText(uchar const *) = 0; + /** + * Make static + * + * Convert to a static collection; reduces space and time complexities. + * New texts can not be inserted after this operation. + */ + virtual void MakeStatic() = 0; + + /** + * Displaying content + * + * Returns the i'th text in the collection. + * The numbering starts from 0. + */ + virtual uchar* GetText(DocId) const = 0; + /** + * Returns substring [i, j] of k'th text + * + * Note: Parameters i and j are text positions inside the k'th text. + */ + virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0; + /** + * Returns backwards (reverse) iterator to the end of i'th text + * + * Note: Do we need this? + * Forward iterator would be really in-efficient compared to + * getText(k) and getText(k, i, j). + * + * TODO Define and implement const_reverse_iterator. + */ + //const_reverse_iterator rend(DocId) const; + + /** + * Existential queries + */ + // Is there a text prefixed by given string? + virtual bool IsPrefix(uchar const *) const = 0; + // Is there a text having given string as a suffix? + virtual bool IsSuffix(uchar const *) const = 0; + // Is there a text that equals given string? + virtual bool IsEqual(uchar const *) const = 0; + // Does a text contain given string? + virtual bool IsContains(uchar const *) const = 0; + // Is there a text that is lexicographically less than given string? + virtual bool IsLessThan(uchar const *) const = 0; + + /** + * Counting queries + * + * Result is the number of documents. + */ + virtual unsigned CountPrefix(uchar const *) const = 0; + virtual unsigned CountSuffix(uchar const *) const = 0; + virtual unsigned CountEqual(uchar const *) const = 0; + virtual unsigned CountContains(uchar const *) const = 0; + virtual unsigned CountLessThan(uchar const *) const = 0; + + /** + * Document reporting queries + * + * Result is a vector of document id's in some undefined order. + */ + // Data type for results + typedef std::vector document_result; + virtual document_result Prefix(uchar const *) const = 0; + virtual document_result Suffix(uchar const *) const = 0; + virtual document_result Equal(uchar const *) const = 0; + virtual document_result Contains(uchar const *) const = 0; + virtual document_result LessThan(uchar const *) const = 0; + + /** + * Full reporting queries + * + * Result is a vector of pairs in some undefined order. + */ + // Data type for results + typedef std::vector > full_result; + virtual full_result FullContains(uchar const *) const = 0; + + protected: + // Protected constructor; call the static function InitTextCollection(). + TextCollection() { }; + + // No copy constructor or assignment + TextCollection(TextCollection const&); + TextCollection& operator = (TextCollection const&); + }; +} +#endif diff --git a/Tools.cpp b/Tools.cpp new file mode 100644 index 0000000..d7260d9 --- /dev/null +++ b/Tools.cpp @@ -0,0 +1,95 @@ +/* + * Collection of basic tools and defines + */ + +#include "Tools.h" +#include + + +time_t Tools::startTime; + +void Tools::StartTimer() +{ + startTime = time(NULL); +} + +double Tools::GetTime() +{ + time_t stopTime = time(NULL); + return difftime( stopTime, startTime ); +} + +uchar * Tools::GetRandomString(unsigned min, unsigned max, unsigned &alphabetSize) +{ + unsigned len = std::rand() % (max - min) + min; + alphabetSize = std::rand() % 26 + 1; + uchar* temp = new uchar[len + 2]; + for (unsigned i = 0; i < len; i++) + temp[i] = 97 + std::rand() % alphabetSize; + temp[len] = 0u ;temp[len+1] = '\0'; + return temp; +} + + +void Tools::PrintBitSequence(ulong *A, ulong len) +{ + for(ulong i = 0; i < len; i++) + if (GetField(A, 1, i)) + std::cout << "1"; + else + std::cout << "0"; + std::cout << "\n"; +} + +unsigned Tools::FloorLog2(ulong i) +{ + unsigned b = 0; + if (i == 0) + return 0; + while (i) + { + b++; + i >>= 1; + } + return b - 1; +} + +unsigned Tools::CeilLog2(ulong i) +{ + unsigned j = FloorLog2(i); + if ((ulong)(1lu << j) != i) + return j + 1; + + return j; +} + +uchar * Tools::GetFileContents(char *filename, ulong maxSize) +{ + std::ifstream::pos_type posSize; + std::ifstream file ((char *)filename, std::ios::in|std::ios::binary|std::ios::ate); + if (file.is_open()) + { + posSize = file.tellg(); + ulong size = posSize; + if (maxSize != 0 && size > maxSize) + size = maxSize; + char *memblock = new char [size + 1]; + file.seekg (0, std::ios::beg); + file.read (memblock, size); + memblock[size] = '\0'; + file.close(); + return (uchar *)memblock; + } + else + return 0; +} + +unsigned Tools::bits (ulong n) + + { unsigned b = 0; + while (n) + { b++; n >>= 1; } + return b; + } + + diff --git a/Tools.h b/Tools.h new file mode 100644 index 0000000..121e46e --- /dev/null +++ b/Tools.h @@ -0,0 +1,110 @@ +/* + * Collection of basic tools and defines + */ + + +#ifndef _TOOLS_H_ +#define _TOOLS_H_ + +#include +#include +#include +#include + +// Generates an error if __WORDSIZE is not defined +#ifndef __WORDSIZE +#error Missing definition of __WORDSIZE; Please define __WORDSIZE in Tools.h! +#endif + +// Check word length on GNU C/C++: +// __WORDSIZE should be defined in , which is #included from +#if __WORDSIZE == 64 +# define W 64 +#else +# define W 32 +#endif + +#define WW (W*2) +#define Wminusone (W-1) + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + + +class Tools +{ +private: + static time_t startTime; +public: + static void StartTimer(); + static double GetTime(); + static uchar * GetRandomString(unsigned, unsigned, unsigned &); + static void PrintBitSequence(ulong *, ulong); + static uchar * GetFileContents(char *, ulong =0); + static ulong * bp2bitstream(uchar *); + static unsigned FloorLog2(ulong); + static unsigned CeilLog2(ulong); + static unsigned bits (ulong); + + static inline void SetField(ulong *A, register unsigned len, register ulong index, register ulong x) + { + ulong i = index * len / W, + j = index * len - i * W; + ulong mask = (j+len < W ? ~0lu << j+len : 0) + | (W-j < W ? ~0lu >> (W-j) : 0); + A[i] = (A[i] & mask) | x << j; + if (j + len > W) + { + mask = ((~0lu) << (len + j - W)); + A[i+1] = (A[i+1] & mask)| x >> (W - j); + } + } + + static inline ulong GetField(ulong *A, register unsigned len, register ulong index) + { + register ulong i = index * len / W, + j = index * len - W * i, + result; + if (j + len <= W) + result = (A[i] << (W - j - len)) >> (W - len); + else + { + result = A[i] >> j; + result = result | (A[i+1] << (WW - j - len)) >> (W - len); + } + return result; + } + + + static inline ulong GetVariableField(ulong *A, register unsigned len, register ulong index) + { + register ulong i=index/W, j=index-W*i, result; + if (j+len <= W) + result = (A[i] << (W-j-len)) >> (W-len); + else { + result = A[i] >> j; + result = result | (A[i+1] << (WW-j-len)) >> (W-len); + } + return result; + } + + static inline void SetVariableField(ulong *A, register unsigned len, register ulong index, register ulong x) { + ulong i=index/W, + j=index-i*W; + ulong mask = (j+len < W ? ~0lu << j+len : 0) + | (W-j < W ? ~0lu >> (W-j) : 0); + A[i] = (A[i] & mask) | x << j; + if (j+len>W) { + mask = ((~0lu) << (len+j-W)); + A[i+1] = (A[i+1] & mask)| x >> (W-j); + } + } + + +}; + +#endif diff --git a/bittree.cpp b/bittree.cpp new file mode 100644 index 0000000..0003461 --- /dev/null +++ b/bittree.cpp @@ -0,0 +1,989 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + +#include "bittree.h" + + +using std::cout; +using std::endl; +using std::cerr; +using std::bitset; +using std::ostream; + +//std::ostream& operator<<(std::ostream& os, node& n) { ... } + + + BVTree::~BVTree(){ + delete tempbit; + tempbit = 0; + } + + + +/** +BVTree::BVTree(char *readfile){ + + std::ifstream file(readfile); + if (!file) + { + cout << "error reading file "<< readfile <<" !\n"; + exit(EXIT_FAILURE); + } + + + tempnil = (BVNode*) ((RBTree*) this)->nil; + tempbit = new bitset<2*logn>; + + + char c; + bitset<8> bs; + + while (file.get(c)) + { + bs = c; + + for (int i=7; i>=0; i--) + appendBit(bs[i]); + } + +} +**/ + + + + +void BVTree::iterateReset(){ + iterate=1; + iterateLocal=0; + if (root!=nil) { + iterateNode = (BVNode*) treeMinimum(root); + iterateRank = ((*iterateNode->block)[0])?1:0; + } else { + iterateNode=getNil(); + iterateRank = 0; + } +} + +bool BVTree::iterateGetBit(){ + return ((*iterateNode->block)[iterateLocal]); +} + +ulong BVTree::iterateGetRank(bool bit){ + if (bit) return iterateRank; + else return (iterate - iterateRank); +} + +bool BVTree::iterateNext(){ + iterate++; + + if (iterate > getPositions()) return false; + + + if (iterateLocal < iterateNode->myPositions-1) { + iterateLocal++; + } else { + // jump to next leaf; + iterateNode = (BVNode*) treeSuccessor(iterateNode); + #ifndef NDEBUG + if (iterateNode==getNil()) { + cout << "iterateNode==getNil()" << endl; + return false; + } + #endif + iterateLocal=0; + } + if ((*iterateNode->block)[iterateLocal]) iterateRank++; + + return true; +} + +ulong BVTree::getPositions(){ + + if (getRoot() == getNil()) return 0; + return getRoot()->subTreePositions; +} + +ulong BVTree::getRank(){ + + if (getRoot() == getNil()) return 0; + return getRoot()->subTreeRank; +} + + +void BVTree::printNode(BVNode *n){ + int commas=(2*logn)/10; + int commashift = -1; + + + cout << "address: " << n << endl; + + if (n->block != 0 && (n!=getNil())) + { + int size = 2*logn + 1 +commas; + + char *myblock = (char *) new char[size]; + if (myblock == 0) { + cerr << "error: printNode: myblock == 0" << endl; + exit(0); + } + + //read: i, write: i+commashift + for (int i=0; i<(int)(n->myPositions); i++){ + if (i%10 == 0) commashift++; + myblock[i+commashift]='0' + (*n->block)[i]; + if (i+commashift >= size-1) { + cerr << "X) printNode: array wrong index" << endl; + exit(0); + } + + } + + cout << "size=" << size << endl; + cout << "n->myPositions=" << n->myPositions << endl; + for (int i=n->myPositions; i<(2*logn); i++){ + if ((i%10) == 0) commashift++; + + myblock[i+commashift]='-'; + if (i+commashift > size-2) { + cerr << "A) printNode: array wrong index" << endl; + exit(0); + } + } + + commashift = 0; + for (int i=10; i < 2*logn; i++){ + if (i%10 == 0) + { + if (i+commashift >= size-2) { + cerr << "B) printNode: array wrong index" << endl; + exit(0); + } + + myblock[i+commashift]=','; + commashift++; + } + + } + + myblock[size - 1]='\0'; + + cout << "block: \"" << myblock << "\"" << endl; + delete myblock; + + } + else + cout << "block: none" << endl; + + + cout << "myPositions: " << n->myPositions << endl; + cout << "myRank: " << n->myRank << endl; + cout << "subTreePositions: " << n->subTreePositions << endl; + cout << "subTreeRank: " << n->subTreeRank << endl; + cout << "color: " << ((n->color==RED)?"RED":"BLACK") << endl; + cout << "parent: " << n->getParent() << endl; + cout << "left: " << n->getLeft() << endl; + cout << "right:" << n->getRight() << endl << endl; + +} + + +int BVTree::getTreeMaxDepth(){ + return getNodeMaxDepth(root); +} + +int BVTree::getTreeMinDepth(){ + return getNodeMinDepth(root); +} + + +void BVTree::updateCounters(BVNode *n){ + + if (n == getNil()) return; + + ulong lR = 0; + ulong lP = 0; + ulong rR = 0; + ulong rP = 0; + + if (n->getLeft() != getNil()) { + lR = n->getLeft()->subTreeRank; + lP = n->getLeft()->subTreePositions; + } + + if (n->getRight() != getNil()) { + rR = n->getRight()->subTreeRank; + rP = n->getRight()->subTreePositions; + } + + n->subTreeRank =lR + rR + n->myRank; + n->subTreePositions=lP + rP + n->myPositions; + +} + +ulong BVTree::getLocalRank(BVNode* n, ulong position){ + + #ifndef NDEBUG + if (position > n->myPositions) { + cerr << "error: getLocalRank: invalid position in block.\n"; + exit(EXIT_FAILURE); + } + #endif + + *tempbit =*(n->block)<<((2*logn)-position); + return tempbit->count(); + + // old version: + //rank = 0; + //for (ulong i = 0; i < position; i++) { + // if ((*n->block)[i]) rank++; + //} +} + +ulong BVTree::getLocalSelect1(BVNode* n, ulong query){ + ulong select =0; + #ifndef NDEBUG + if (query > n->myPositions) { + cerr << "error: getLocalSelect1: invalid position in block.\n"; + exit(EXIT_FAILURE); + } + #endif + ulong i; + for (i = 0; select < query; i++) { // TODO is there a faster solution similar to rank ? + if ((*n->block)[i]) select++; + } + + return i; +} + +ulong BVTree::getLocalSelect0(BVNode* n, ulong query){ + ulong select =0; + #ifndef NDEBUG + if (query > n->myPositions) { + cerr << "error: getLocalSelect0: invalid position in block.\n"; + exit(EXIT_FAILURE); + } + #endif + ulong i; + for (i = 0; select < query; i++) { + if (!(*n->block)[i]) select++; + } + + return i; +} + + +void BVTree::printNode(ulong i){ + BVNode* x = getRoot(); + + #ifndef NDEBUG + if (x == getNil()) { + cerr << "error: printNode(int i): root=NULL.\n"; + exit(EXIT_FAILURE); + + } + + if (i > x->myPositions) { + cerr << "error: printNode(int i): invalid position in block.\n"; + exit(EXIT_FAILURE); + } + #endif + + ulong lP=0; + bool search = true; + //find the corresponding block: + while (search) + { + + lP = x->getLeft()->subTreePositions; + + + if (lP >= i) + { + x=x->getLeft(); + } + else if (lP+x->myPositions >= i){ + i-=lP; + search = false; + } + else{ + i-=(lP+x->myPositions); + x=x->getRight(); + } + } + + + cout << "i=" << i << endl; + printNode(x); +} + + +bool BVTree::operator[](ulong i){ + + BVNode* x = getRoot(); + ulong lsP=0; + bool search = true; + //find the corresponding block: + while (search) + { + + lsP = x->getLeft()->subTreePositions; + + if (lsP >= i) + { + #ifndef NDEBUG + if (x->getLeft()==getNil()) { + cout << "lsP: " << lsP << endl; + printNode(x); + cout << "ihhh" << endl; + checkTree(); + exit(EXIT_FAILURE); + } + #endif + x=x->getLeft(); + } + else if (lsP+x->myPositions >= i){ + i-=lsP; + search = false; + } + else{ + i-=(lsP+x->myPositions); + #ifndef NDEBUG + if (x->getRight()==getNil()) { + cout << "i: " << i << endl; + cout << "lsP: " << lsP << endl; + cout << "x->myPositions: " << x->myPositions << endl; + printNode(x); + cout << "ihhh" << endl; + checkTree(); + exit(EXIT_FAILURE); + } + #endif + x=x->getRight(); + } + } + + return (*x->block)[i-1]; +} + + +ulong BVTree::rank1(ulong i){ + BVNode* x = getRoot(); + + if (i == this->getPositions() + 1) i--; + #ifndef NDEBUG + if (i > this->getPositions() ) { + cerr << "error: rank1(0): invalid position in bittree: " << i << endl; + cerr << "error: rank1(0): this->getPositions(): " << this->getPositions()<< endl; + exit(EXIT_FAILURE); + } + #endif + + + ulong lsP=0; + ulong lsR=0; + ulong rank=0; + bool search = true; + //find the corresponding block: + while (search) + { + + lsP = x->getLeft()->subTreePositions; + lsR = x->getLeft()->subTreeRank; + + if (lsP >= i) + {// cout << "L" << endl; + x=x->getLeft(); + } + else if (lsP+x->myPositions >= i){ + i-=lsP; + rank+=lsR; + search = false; + } + else{//cout << "R" << endl; + i-=(lsP+x->myPositions); + rank+=(lsR+x->myRank); + x=x->getRight(); + } + } + + rank+=getLocalRank(x, i); + return rank; +} + +ulong BVTree::rank0(ulong i){ + if (this->getPositions() == 0) return 0; + return (i-rank1(i)); +} + +ulong BVTree::select1(ulong i){ + BVNode* x = getRoot(); + ulong select=0; + #ifndef NDEBUG + if (i > x->subTreeRank ) { + cerr << "error: select1: invalid position in bittree: " << i << endl; + exit(EXIT_FAILURE); + } + #endif + + ulong lsP=0; + ulong lsR=0; + bool search = true; + //find the corresponding block: + while (search) + { + // update subTree-counters + + + lsP = x->getLeft()->subTreePositions; + lsR = x->getLeft()->subTreeRank; + + if (lsR >= i) { + x=x->getLeft(); + } + else if (lsR+x->myRank >= i) { + i-=lsR; + select+=lsP; + search = false; + } + else { + i-=(lsR+x->myRank); + select+=(lsP+x->myPositions); + x=x->getRight(); + } + } + select+=getLocalSelect1(x, i); + + + + return select; +} + +ulong BVTree::select0(ulong i){ + BVNode* x = getRoot(); + ulong select=0; + + #ifndef NDEBUG + if (i > (x->subTreePositions - x->subTreeRank) || i < 0) { + cerr << "error: select1: invalid position in bittree: " << i << endl; + exit(EXIT_FAILURE); + } + #endif + + + ulong lsP=0; + ulong lsR=0; + ulong lmR=0; + ulong lmP=0; + bool search = true; + //find the corresponding block: + while (search) + { + // update subTree-counters + + + lsP = x->getLeft()->subTreePositions; + lsR = x->getLeft()->subTreeRank; + lmR = x->getLeft()->myRank; + lmP = x->getLeft()->myPositions; + + if (lsP-lsR >= i) + { + x=x->getLeft(); + } + else if ((lsP-lsR)+(x->myPositions-x->myRank) >= i){ + i-=lsP-lsR; + select+=lsP; + search = false; + } + else{ + i-=((lsP-lsR)+(x->myPositions-x->myRank)); + select+=(lsP+x->myPositions); + x=x->getRight(); + } + } + + select+=getLocalSelect0(x, i); + + return select; +} + + +void BVTree::updateCountersOnPathToRoot(BVNode *walk){ + + while (walk != getNil()) { + updateCounters(walk); + walk=walk->getParent(); + } +} + +//deletes the BVNode and all its children, destroys reb-black-tree property! +void BVTree::deleteNode(BVNode *n){ + if (n==getNil() || n == 0) return; + + if (n->getLeft() != getNil() && n->getLeft()) deleteNode(n->getLeft()); + if (n->getRight() != getNil() && n->getRight()) deleteNode(n->getRight()); + + delete n; +} + + + + +// TODO improve by returning bitvalue + +void BVTree::deleteBit(ulong i){ + ulong old_i = i; + BVNode* x = getRoot(); + bool bit; + ulong rank=0; + + #ifndef NDEBUG + if (x == getNil()) { + cerr << "error: deleteBit, root is empty\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + + if (i > x->subTreePositions || i < 1) + { + cerr << "error: A, position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + #endif + + ulong lsP=0; + ulong lsR=0; + + bool search = true; + //find the corresponding block: + while (search) + { + // update of pointers is not yet possible: call updateCountersOnPathToRoot + + lsP = x->getLeft()->subTreePositions; + lsR = x->getLeft()->subTreeRank; + + + if (lsP >= i) + { + #ifndef NDEBUG + if (x->getLeft()==getNil()) exit(EXIT_FAILURE); + #endif + x=x->getLeft(); + } + else if (lsP+x->myPositions >= i){ + i-=lsP; + rank+=lsR; + search = false; + } + else{ + i-=(lsP+x->myPositions); + rank+=(lsR+x->myRank); // for speedup! + #ifndef NDEBUG + if (x->getRight()==getNil()) exit(EXIT_FAILURE); + #endif + x=x->getRight(); + } + } + + + + // now delete the bit from the block x: + bit =(*x->block)[i-1]; + + // store bit and rank information for speedup + lastBitDeleted=bit; + rank+=getLocalRank(x, i); + lastRank=(bit?rank:old_i-rank); + + #ifndef NDEBUG + if (i > x->myPositions) { + cerr << "error: B, position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + #endif + bitset<2*logn> mask; + + + if ( i > 1 ) + { + mask.set(); + mask>>=(2*logn - i + 1); + mask &= *(x->block); + (*(x->block)>>=i)<<=i-1; // shift bits by one + (*x->block)|=mask; // restore bits in front of deleted bit + } else { + *(x->block)>>=1; // shift bits by one + } + + x->myPositions--; + if (bit) x->myRank--; + + updateCountersOnPathToRoot(x); + + + if (x->myPositions == 0){ // if merging was not possible: + + rbDelete(x, callUpdateCountersOnPathToRoot); + delete x; + + } else if (x->myPositions <= logn/2) // merge (and rotate), if necessary: + { + + BVNode *sibling = (BVNode*) treeSuccessor(x); + //cout << "try to merge -----------------------------------------" << endl; + if (sibling != getNil()) + { + if (x->myPositions + sibling->myPositions < 2*logn) //merge ! + { + //cout << "merge with right sibling -----------------------------------------" << endl; + // move bits from sibling to x: + + (*sibling->block)<<=x->myPositions; + (*x->block)|=(*sibling->block); + + x->myPositions+=sibling->myPositions; + x->myRank+=sibling->myRank; + updateCountersOnPathToRoot(x); + rbDelete(sibling, callUpdateCountersOnPathToRoot); + delete sibling; + } + else sibling = getNil(); //to try the left sibling! + } + else if ((sibling= (BVNode*) treePredecessor(x)) != getNil()) + { + if (x->myPositions + sibling->myPositions < 2*logn) //merge ! + { + // move bits from sibling to x: (more complicated, needs shift!) + //cout << "merge with left sibling -----------------------------------------" << endl; + (*x->block)<<=sibling->myPositions; + x->myPositions+=sibling->myPositions; + + (*x->block)|=(*sibling->block); + + x->myRank+=sibling->myRank; + updateCountersOnPathToRoot(x); + rbDelete(sibling, callUpdateCountersOnPathToRoot); + delete sibling; + } + + + + } // end else if + } // end else if + + + +} + +void BVTree::writeTree(char *writefile){ + std::ofstream write(writefile); + if (!write) + { + cerr << "error: Error writing file: " << writefile<< "." << endl; + exit(EXIT_FAILURE); + } + writeTree(write); +} + + +ulong* BVTree::getBits(){ + BVNode *n = getRoot(); + ulong blockCounter=0; + ulong len = getPositions(); + + int W = sizeof(ulong)*8; + + + ulong bitsLength = len/W + ((len%W==0)?0:1); + + ulong *bits = new ulong[bitsLength]; + + ulong i=0; //0 .. bitsLength-1 + int j=0; //0 .. W-1 + + ulong mask_LeftBitSet = 1; + + mask_LeftBitSet <<= W-1; + + n=(BVNode*) treeMinimum(root); + while (n != getNil()) { + #ifndef NDEBUG + if (n->block == 0) { + cerr << "getBits(): block not found !!!" << endl; + exit(EXIT_FAILURE); + } + #endif + for (blockCounter=0; blockCounter < n->myPositions; blockCounter++) { + if (j == W) { + i++; + j=0; + } + bits[i] >>= 1; // set 0 + + if ((*n->block)[blockCounter]) bits[i] |= mask_LeftBitSet; // set 1 + j++; + } + n=(BVNode*) treeSuccessor(n); + } + + if (i != bitsLength-1) { + cout << "last index is wrong" << endl; + exit(EXIT_FAILURE); + } + while (j < W) { + bits[i] >>= 1; + j++; + } + + return bits; +} + +void BVTree::writeTree(ostream& stream){ + BVNode *x; + char c=0; + ulong cCounter=0; + ulong blockCounter=0; + + x = (BVNode*) treeMinimum(getRoot()); + + while (x != getNil()){ + blockCounter=x->myPositions; + while (blockCounter > 0) + { + while (blockCounter > 0 && cCounter < 8 ) + { + c<<=1; + if ((*x->block)[x->myPositions - blockCounter]) c++; + cCounter++; + blockCounter--; + } + if (cCounter == 8) + { + stream << c; + cCounter = 0; + } + } + + + x = (BVNode*) treeSuccessor(x); + } + + if (cCounter != 0) { + while (cCounter < 8 ) + { + c<<=1; // fill with zeros. + cCounter++; + } + stream << c; + } +} + + +void BVTree::appendBit(bool bit){ + ulong pos = 1; + if (root != getNil()) pos= getRoot()->subTreePositions + 1; + + insertBit(bit, pos); +} + +void BVTree::insertBit(bool bit, ulong i){ + if (getRoot() == getNil()) + { + BVNode *newNode; + newNode = new BVNode(getNil()); + newNode->color=BLACK; + bitset<2*logn> *newBlock; + newBlock=new bitset<2*logn>; + newNode->block = newBlock; + setRoot(newNode); + + } + + BVNode* x = getRoot(); + + #ifndef NDEBUG + if (i > x->subTreePositions+1) + { + printNode(x); + cerr << "error: insertBit: position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + if (i < 1) + { + cerr << "error: insertBit: position " << i <<" is not valid.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + #endif + + ulong lsP=0; + + + while (true) + { + // update subTree-counters + (x->subTreePositions)++; + if (bit) (x->subTreeRank)++; + + lsP = x->getLeft()->subTreePositions; + + if (lsP >= i) + {// cout << "A" << endl; + x=x->getLeft(); + } + else if (lsP+x->myPositions >= i-1){ //-1 to append to last position in current node + i-=lsP; + break; + } + else{ + i-=(lsP+x->myPositions); + x=x->getRight(); + } + } + + // now put the bit into the block x: + bitset<2*logn> mask; + + if (x->myPositions > 0) + { + if (i != 1) + { + mask.set(); + mask>>=2*logn - i +1; // make 1's at all positions < i + mask &= *(x->block); // store bits in front of new bit + (*(x->block)>>=i-1)<<=i; // shift bits by one + (*x->block)|=mask; // restore bits in front of new bit + } + else + (*x->block)<<=1; + } + + (*x->block)[i-1]=bit; // set new bit + + // update local pointers + (x->myPositions)++; //update p (leaf) + if (bit) (x->myRank)++; //update r (leaf) + + #ifndef NDEBUG + if ((int)x->myPositions > 2*logn) + { + cerr << "error: positions in block already too many.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + #endif + + // split and rotate, if necessary + if ((int)x->myPositions == 2*logn) + { + //cout << "split !-------------" << endl; + + // new node: + BVNode *newNode; + newNode = new BVNode(getNil()); + + //find place for new node: + BVNode *y;// some parent of new node + if (x->getRight() != getNil()) { + y = (BVNode*) this->treeMinimum(x->getRight()); + y->setLeft(newNode); + } else { + y=x; + y->setRight(newNode); + } + newNode->setParent(y); + + //new block: + bitset<2*logn> *newBlock; + newBlock=new bitset<2*logn>; + *newBlock=*x->block>>logn; //copy of bits into the new block + + mask.set(); + mask>>=logn; + *x->block &= mask; //delete bits that already have been copied to the new block + + newNode->block=newBlock; + newNode->myRank=(*newNode->block).count(); + newNode->myPositions=logn; + newNode->color=RED; + + //update old node x: + x->myRank-=newNode->myRank; //=(*x->block).count(); + x->myPositions=logn; + + updateCountersOnPathToRoot(newNode); // meets x + rbInsertFixup(newNode, callUpdateCounters); + + } // end if + + +} + +void BVTree::checkSubTree(BVNode *n){ + ulong lP = 0; + ulong lR = 0; + ulong rP = 0; + ulong rR = 0; + + if (n->getLeft()!=getNil()) { + lP = n->getLeft()->subTreePositions; + lR = n->getLeft()->subTreeRank; + if (n->getLeft()->getParent() != n) { + cout << "au"<< endl; + exit(1); + } + + } + + if (n->getRight()!=getNil()) { + rP = n->getRight()->subTreePositions; + rR = n->getRight()->subTreeRank; + if (n->getRight()->getParent() != n) { + cout << "au"<< endl; + exit(1); + } + } + + if ( (n->subTreePositions != (n->myPositions + lP + rP)) || + (n->subTreeRank != (n->myRank + lR + rR)) ){ + cout << "checkSubTree: error" <myPositions + lP + rP: " << n->myPositions + lP + rP << endl; + cout << "n->myRank + lR + rR: " << n->myRank + lR + rR << endl; + printNode(n); + printNode(n->getLeft()); + printNode(n->getRight()); + exit(1); + } + + if (n->getLeft()!=getNil()) checkSubTree(n->getLeft()); + if (n->getRight()!=getNil()) checkSubTree(n->getRight()); +} + +void callUpdateCounters(RBNode *n, RBTree *T){ + ((BVTree*)T)->updateCounters((BVNode*) n); +} + +void callUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){ + ((BVTree*)T)->updateCountersOnPathToRoot((BVNode*) n); +} + diff --git a/bittree.h b/bittree.h new file mode 100644 index 0000000..5344f74 --- /dev/null +++ b/bittree.h @@ -0,0 +1,227 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +// Implementation of the Dynamic Bit Vector with Indels problem +// space: O(n) time: O(log(n)) +// papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text +// Indexes. CPM 2006, Chapter 3.4 Dynamic Structures for Bit Vectors +// also: W.-L. Chan, W.-K. Hon, and T.-W. Lam. Compressed index for a dynamic collection +// of texts. In Proc. CPM04, LNCS 3109, pages 445-456, 2004 + +#ifndef BVTree +#define BVTree BVTree + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rbtree.h" + + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + + +#ifndef LOGN +#define LOGN 64 +#endif + +const int logn = LOGN; + +//upperBound = 2 * logn; +//lowerBound = logn / 2; + + +class BVNode; +class BVTree; + +void callUpdateCounters(RBNode *n, RBTree *T); +void callUpdateCountersOnPathToRoot(RBNode *n, RBTree *T); + +class BVNode : public RBNode +{ + public: + ulong myPositions; // 4*4 bytes = 16 bytes + ulong myRank; + ulong subTreePositions; //number of positions stored in the subtree rooted at this node + ulong subTreeRank; //number of bits set in the subtree rooted at this node + + std::bitset<2*logn> *block; // 4 bytes + + + BVNode() + : RBNode(this), myPositions(0), myRank(0), subTreePositions(0), subTreeRank(0), block(0) { + } + + + BVNode(BVNode* n) + : RBNode(n), myPositions(0), myRank(0), subTreePositions(0), subTreeRank(0), block(0) { + } + + ~BVNode(){ + delete block; + block = 0; + } + + + BVNode* getParent(){ + return ((BVNode*) ((RBNode*) this)->parent); + } + + BVNode* getLeft(){ + return ((BVNode*) ((RBNode*) this)->left); + } + + BVNode* getRight(){ + return ((BVNode*) ((RBNode*) this)->right); + } + + void setParent(BVNode* n){ + ((RBNode*) this)->parent=(RBNode*)n; + } + + void setLeft(BVNode* n){ + ((RBNode*) this)->left=(RBNode*)n; + } + + void setRight(BVNode* n){ + ((RBNode*) this)->right=(RBNode*)n; + } + +}; + + +class BVTree : public RBTree{ +public: + + //Constructors + BVTree(){ + + setNil(new BVNode()); + setRoot(getNil()); + + tempnil = getNil(); + + tempbit = new std::bitset<2*logn>; + } + + //Destructor: + ~BVTree(); + + bool operator[](ulong); + + + // inserts bit at position i, countings begins with 1: + void appendBit(bool bit); + void insertBit(bool bit, ulong i); + void deleteBit(ulong i); + + ulong rank0(ulong i); + ulong rank1(ulong i); + ulong rank(bool b, ulong i){return b?rank1(i):rank0(i);} + + ulong select0(ulong i); + ulong select1(ulong i); + ulong select(bool b, ulong i){return b?select1(i):select0(i);} + + void setRoot(BVNode* n){ + ((RBTree*) this)->root=(RBNode*)n; + } + + BVNode* getRoot(){ + return ((BVNode*) ((RBTree*) this)->root); + } + + void setNil(BVNode* n){ + tempnil = n; + ((RBTree*) this)->nil=(RBNode*)n; + } + + BVNode* getNil(){ + return tempnil; + + } + + // write bits back into a stream: + ulong* getBits(); + void writeTree(char *writefile); + void writeTree(std::ostream& stream); //e.g. stream = cout + + int getTreeMaxDepth(); + int getTreeMinDepth(); + ulong getPositions(); + ulong getRank(); + + void iterateReset(); + bool iterateGetBit(); + bool iterateNext(); + ulong iterateGetRank(bool bit); + + bool getLastBitDeleted(){return lastBitDeleted;} + ulong getLastRank(){return lastRank;} + + void checkSubTree(BVNode *n); + + void updateCounters(BVNode *n); + void updateCountersOnPathToRoot(BVNode *walk); + + //debug: + void printNode(ulong i); + +protected: + + ulong iterate; + ulong iterateLocal; + ulong iterateRank; + BVNode *iterateNode; + + BVNode *tempnil; + + bool lastBitDeleted; + ulong lastRank; + + + std::bitset<2*logn> *tempbit; + + // content of BVNode, for debugging: + void printNode(BVNode *n); + + // other operations: + ulong getLocalRank(BVNode* n, ulong position); + ulong getLocalSelect0(BVNode* n, ulong query); + ulong getLocalSelect1(BVNode* n, ulong query); + + void deleteNode(BVNode *n); + void deleteLeaf(BVNode *leaf); +}; + + +#endif + diff --git a/dependencies.mk b/dependencies.mk new file mode 100644 index 0000000..4e40bc8 --- /dev/null +++ b/dependencies.mk @@ -0,0 +1,12 @@ +BitRank.o: BitRank.cpp BitRank.h Tools.h +CSA.o: CSA.cpp CSA.h BitRank.h Tools.h dynFMI.h bittree.h rbtree.h \ + handle.h pos.h +TextCollection.o: TextCollection.cpp TextCollection.h Tools.h +Tools.o: Tools.cpp Tools.h +bittree.o: bittree.cpp bittree.h rbtree.h +dynFMI.o: dynFMI.cpp dynFMI.h bittree.h rbtree.h handle.h pos.h +handle.o: handle.cpp handle.h rbtree.h pos.h +pos.o: pos.cpp pos.h rbtree.h handle.h +rbtree.o: rbtree.cpp rbtree.h +testTextCollection.o: testTextCollection.cpp CSA.h BitRank.h Tools.h \ + dynFMI.h bittree.h rbtree.h handle.h pos.h TextCollection.h diff --git a/dynFMI.cpp b/dynFMI.cpp new file mode 100644 index 0000000..dd4e989 --- /dev/null +++ b/dynFMI.cpp @@ -0,0 +1,807 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + + +#include +#include +#include +#include +#include +#include +#include + +#include "dynFMI.h" + +using std::cerr; +using std::cout; +using std::endl; +using std::ios; +using std::priority_queue; +using std::vector; + + + +// -------- DynFMI -------- + +DynFMI::~DynFMI(){ + deleteDynFMINodes(root); + root = 0; + delete[] leaves; //free(leaves) // ???; +#if SAMPLE!=0 + delete SampledSAPositionsIndicator; +#endif + delete handle; + delete pos; +#if SAMPLE!=0 + delete sampledSATree; +#endif +} + +void DynFMI::deleteDynFMINodes(WaveletNode *n){ + if (n->right) deleteDynFMINodes(n->right); + if (n->left) deleteDynFMINodes(n->left); + delete n; +} + +DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){ + initEmptyDynFMI(text, x, n); + if (del) delete [] text; +} + +DynFMI::DynFMI(char *readfile){ + + std::ifstream file(readfile); + if (!file) + { + cerr << "error reading file: " << readfile << endl; + exit(EXIT_FAILURE); + } + + file.seekg(0, ios::end); + ulong n = file.tellg(); + file.seekg(0, ios::beg); + + uchar *text = new uchar[n]; + + char c; + ulong i=0; + + + while (file.get(c)) + { + text[i]=c; + i++; + } + file.close(); + + initEmptyDynFMI(text, 1, n); + + + delete text; + +} + +void DynFMI::iterateReset(){ + iterate = 1; + recursiveIterateResetWaveletNode(root); +} + +void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){ + w->bittree->iterateReset(); + + if (w->left) recursiveIterateResetWaveletNode(w->left); + if (w->right) recursiveIterateResetWaveletNode(w->right); +} + + +bool DynFMI::iterateNext(){ + iterate++; + return !(iterate > getSize()); +} + +uchar DynFMI::iterateGetSymbol(){ + + ulong i = iterate; + WaveletNode *walk = root; + bool bit; + + while (true) { + + bit = walk->bittree->iterateGetBit(); // TODO improve + i=walk->bittree->iterateGetRank(bit); + + walk->bittree->iterateNext(); + + + if (bit) { //bit = 1 + if (walk->right == 0) return walk->c1; + walk=walk->right; + } else { // bit = 0 + if (walk->left == 0) return walk->c0; + walk=walk->left; + } + + + } // end of while + +} + + +uchar* DynFMI::getBWT(){ + ulong n = root->bittree->getPositions(); + n++; + uchar *text = new uchar[n]; + + bool data=true; + // old slow version: + //for (ulong i=1; i <= root->bittree->getPositions(); i++) + // text[i-1]=(*this)[i]; + + ulong i = 0; + + iterateReset(); + + while (data) { + text[i] = iterateGetSymbol(); + data = iterateNext(); + i++; + } + text[n-1]=0; + + return text; +} + +std::ostream& DynFMI::getBWTStream(std::ostream& stream){ + ulong n = root->bittree->getPositions(); + n++; + //uchar *text = new uchar[n]; + + bool data=true; + // old slow version: + //for (ulong i=1; i <= root->bittree->getPositions(); i++) + // text[i-1]=(*this)[i]; + + ulong i = 0; + + iterateReset(); + + while (data) { + //text[i] = iterateGetSymbol(); + stream << iterateGetSymbol(); + data = iterateNext(); + i++; + } + //text[n-1]=0; + + return stream; +} + +//void DynFMI::printDynFMIContent(ostream& stream){ +// uchar c; +// for (ulong i=1; i<=getSize(); i++) +// { +// c =(*this)[i]; +// if (c==0) c= '#'; +// stream << c; +// } + + +void DynFMI::deleteLeaves(WaveletNode *node){ + bool leaf = true; + + if (node->left) { + // internal node + leaf = false; + deleteLeaves(node->left); + + } + if (node->right){ + leaf = false; + deleteLeaves(node->right); + } + + if (leaf) { + // is a leaf, delete it! + if (node->parent) { + if (node==node->parent->left) node->parent->left=0; + else node->parent->right=0; + } + delete node; + } +} + +void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){ + #ifndef NDEBUG + if (node == node->left) { + cout << "makeCodes: autsch" << endl; + exit(0); + } + #endif + + if (node->left) { + makeCodes(code | (0 << bits), bits+1, node->left); + makeCodes(code | (1 << bits), bits+1, node->right); + } else { + codes[node->c0] = code; + codelengths[node->c0] = bits+1; + } +} + +void DynFMI::appendBVTrees(WaveletNode *node){ + node->bittree = new BVTree(); + + if (node->left) appendBVTrees(node->left); + if (node->right) appendBVTrees(node->right); +} + +void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){ + // pointers to the leaves for select + leaves = (WaveletNode**) new WaveletNode*[256]; + for(int j=0; j<256; j++) leaves[j]=0; + + + ulong i; + for(i=0; i < n; i++) { + if (text[i]==0) { + cerr << "Error: unsigned char 0 is reserved." << endl; + exit(EXIT_FAILURE); + } + + if (leaves[text[i]]==0) { + leaves[text[i]] = new WaveletNode(text[i]); + } + leaves[text[i]]->weight++; + } + + // separation symbol: + leaves[0] = new WaveletNode((uchar)0); + leaves[0]->weight=x; + + // Veli's approach: + priority_queue< WaveletNode*, vector, std::greater > q; + + + for(int j=255; j>=0; j--){ + if (leaves[j]!=0) { + q.push( (leaves[j]) ); + } + codes[j] = 0; + codelengths[j] = 0; + } + + // creates huffman shape: + while (q.size() > 1) { + + WaveletNode *left = q.top(); + q.pop(); + + WaveletNode *right = q.top(); + q.pop(); + + q.push( new WaveletNode(left, right) ); + } + + + root = q.top(); + q.pop(); + + + makeCodes(0,0, root); // writes codes and codelengths + + + // merge leaves (one leaf represent two characters!) + for(int j=0; j<256; j++){ + + if (leaves[j]) { + + if (leaves[j]->parent->left==leaves[j]) { + leaves[j]->parent->c0=j; + } else { + leaves[j]->parent->c1=j; + } + leaves[j]=leaves[j]->parent; // merge + } + } + + deleteLeaves(root); + appendBVTrees(root); + + // array C needed for backwards search + for(int j=0; j<256+256; j++) C[j] = 0; + +#if SAMPLE!=0 + this->SampledSAPositionsIndicator = new BVTree(); + this->sampledSATree = new SampledSATree(); +#endif + + this->pos = new Pos(sampleInterval); + this->handle = new Handle(); + + pos->handle=this->handle; + handle->pos=this->pos; + +} + +void DynFMI::append(uchar *text, ulong length){ + ulong i; + ulong start = root->bittree->getPositions(); + for (i = start; ibittree->getPositions()+1); +} + +void DynFMI::insert(uchar c, ulong i){ + #ifndef NDEBUG + if (codelengths[c]==0) { + cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;; + exit(EXIT_FAILURE); + } + #endif + + ulong level = 0; + ulong code = codes[c]; + + bool bit; + WaveletNode *walk = root; + + while (walk) { + + bit = ((code & (1u << level)) != 0); + + walk->bittree->insertBit(bit,i); // TODO improve + i=walk->bittree->rank(bit, i); + + if (bit) { //bit = 1 + walk=walk->right; + } else { // bit = 0 + walk=walk->left; + } + + level++; + } // end of while + + int j = 256+c; + while(j>1) { + C[j]++; + j=binaryTree_parent(j); + } + C[j]++; + +} + +void DynFMI::deleteSymbol(ulong i){ + WaveletNode *walk = root; + bool bit; + uchar c; + while (true) { + + // original slow version: + //bit = (*walk->bittree)[i]; + //old_i = i; + //i=walk->bittree->rank(bit, i); + //walk->bittree->deleteBit(old_i); + + + walk->bittree->deleteBit(i); + bit=walk->bittree->getLastBitDeleted(); + i=walk->bittree->getLastRank(); + + if (bit) { //bit = 1 + if (walk->right == 0) { + c = walk->c1; + break; + } + walk=walk->right; + } else { // bit = 0 + if (walk->left == 0) { + c = walk->c0; + break; + } + walk=walk->left; + } + + + } // end of while + + int j = 256+c; + while(j>1) { + C[j]--; + j=binaryTree_parent(j); + } + C[j]--; + + +} + + +uchar DynFMI::operator[](ulong i){ + WaveletNode *walk = root; + bool bit; + + while (true) { + + bit = (*walk->bittree)[i]; //TODO improve by reducing + i=walk->bittree->rank(bit, i); + + if (bit) { //bit = 1 + if (walk->right == 0) return walk->c1; + walk=walk->right; + } else { // bit = 0 + if (walk->left == 0) return walk->c0; + walk=walk->left; + } + + + } // end of while + cout << endl; +} + +ulong DynFMI::rank(uchar c, ulong i){ + if (i==0) return 0; + + ulong level = 0; + ulong code = codes[c]; + + + bool bit; + WaveletNode *walk = root; + + while (true) { + + bit = ((code & (1u << level)) != 0); + + i=walk->bittree->rank(bit, i); + if (bit) { //bit = 1 + if (walk->right == 0) return i; + walk=walk->right; + } else { // bit = 0 + if (walk->left == 0) return i; + walk=walk->left; + } + + level++; + } // end of while + + cerr << "error: DynFMI::rank: left while loop" << endl; + exit(EXIT_FAILURE); + return 0; //never +} + +ulong DynFMI::select(uchar c, ulong i){ + + WaveletNode *walk = leaves[c]; + + bool bit = (walk->c1==c); + + while (walk->parent) { + i=walk->bittree->select(bit, i); + + bit = (walk == walk->parent->right); + walk=walk->parent; + } // end of while + + i=walk->bittree->select(bit, i); + + return i; +} + + +ulong DynFMI::count(uchar *pattern){ + + ulong i = strlen((char*)pattern); + ulong sp = 1; + + ulong ep = getSize(); + uchar s; + while ((sp <= ep) && (i >=1)) { + s=pattern[i-1]; + sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1; + ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep); + + i--; + } + return ep-sp +1; +} + +#if SAMPLE!=0 +ulong* DynFMI::locate(uchar *pattern){ + + ulong i = strlen((char*)pattern); + ulong sp = 1; + + ulong ep = getSize(); + uchar s; + while ((sp <= ep) && (i >=1)) { + s=pattern[i-1]; + sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1; + ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep); + + i--; + } + + ulong numberOfMatches = ep-sp +1; + ulong *matches = new ulong[2*numberOfMatches + 1]; + matches[0] = numberOfMatches; + + ulong diff; + ulong k = 1; + cout << "----------------------" << endl; + for (ulong j=sp; j <= ep; j++) { + + i=j; + diff = 0; + while (!(*SampledSAPositionsIndicator)[i]) { + diff++; + s=(*this)[i]; + i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i); + + } + + + sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i)); + + //cout << "found: (" << sampledSATree->handle << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl; + matches[k++] = sampledSATree->handle; + matches[k++] = sampledSATree->sampledSAValue + diff; + } + + return matches; +} +#endif + +// size must include endmarker! +ulong DynFMI::addText(uchar const * str, ulong n){ + ulong i; +#if SAMPLE!=0 + bool sample = false; +#endif + i=pos->getSize()+1; + + ulong key = pos->appendText(n); + + // Adding an empty text (end-marker) + if (n == 1) + { + insert(str[0], i); + //if (del) delete [] str; + return key; + } + + insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1 +#if SAMPLE!=0 + sample = (n-1 % sampleInterval == 0); + SampledSAPositionsIndicator->insertBit(sample, i); + if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + for (ulong t=n-2; t > 0; t--) { + i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i); + insert(str[t-1],i); +#if SAMPLE!=0 + sample = ((t) % sampleInterval == 0); + SampledSAPositionsIndicator->insertBit(sample, i); + + if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + } + + + i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i); + insert(str[n-1],i); +#if SAMPLE!=0 + SampledSAPositionsIndicator->insertBit(true, i); + sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i)); +#endif +// if (del) delete [] str; + + return key; +} + + +ulong DynFMI::addTextFromFile(char* filename, ulong n){ + ulong i; +#if SAMPLE!=0 + bool sample = false; +#endif + i=pos->getSize()+1; + + + std::ifstream str(filename, ios::binary); + if (!str) + { + cerr << "error reading file: " << filename << endl; + exit(EXIT_FAILURE); + } + + char c; + //char oldc; + + + ulong buf_len; //1048576; //10MB + char * buffer; + ulong buf_pos; + + if (BUFFER==0) buf_len = n; + else buf_len = BUFFER; + + buffer = new char[buf_len]; + buf_pos = (n-2) / buf_len; + str.seekg(buf_pos*buf_len); + str.read(buffer, buf_len); + str.clear(std::ios_base::goodbit); + + + ulong key = pos->appendText(n); + + //str.seekg(n-2); + + //c=str.get(); + c=buffer[(n-2)%buf_len]; + insert(c,i); // insert second last character, corresponds to suffix of length 1 +#if SAMPLE!=0 + sample = (n-1 % sampleInterval == 0); + SampledSAPositionsIndicator->insertBit(sample, i); + if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + if ((n-2)%buf_len == 0) { + c=buffer[0]; + buf_pos--; + str.seekg(buf_pos*buf_len); + str.read(buffer, buf_len); + } + for (ulong t=n-2; t > 0; t--) { + + i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i); + + c=buffer[(t-1)%buf_len]; + insert(c,i); + + //check buffer + if (((t-1)%buf_len == 0) && ((t-1)!=0)) { +#ifndef NDEBUG + if (buf_pos == 0) { + cerr << "buf_pos too small" << endl; + exit(0); + } +#endif + buf_pos--; + str.seekg(buf_pos*buf_len); + str.read(buffer, buf_len); + } + +#if SAMPLE!=0 + sample = ((t) % sampleInterval == 0); + SampledSAPositionsIndicator->insertBit(sample, i); + + if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + } + + i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); + insert(c,i); + +#if SAMPLE!=0 + SampledSAPositionsIndicator->insertBit(true, i); + sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + + str.close(); + + return key; +} + + +uchar* DynFMI::retrieveText(ulong key){ + cout << "key: " << key << endl; + uchar *text=0; + // TODO access two times, too much + ulong i=handle->getPos(key); //=bwtEnd; + if (i == 0 ) { + return text; + } + + ulong n=pos->getTextSize(i); + + text = new uchar[n]; // last byte 0 for cout + + uchar c; + +//TODO better + for (ulong t=n-2; t < n; t--) { + c = (*this)[i]; + text[t]=(c==0?'#':c); + + i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); + } + + #ifndef NDEBUG + for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-'; + #endif + + text[n-1] = 0; + return text; +} + + +bool DynFMI::deleteText(ulong key){ + // TODO access two times, can do better + ulong i=handle->getPos(key); //=bwtEnd; + if (i == 0) return false; + + ulong n=pos->getTextSize(i); + ulong *text = new ulong[n]; + uchar c; + + for (ulong t=n-1; t deleteSymbol(text[i]); +#if SAMPLE!=0 + SampledSAPositionsIndicator->deleteBit(text[i]); +//TODO samples ? +#endif + } + delete[] text; + handle->deleteKey(key); + + + return true; +} + +ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){ + int j = 256+c; + ulong r=0; + while(j>1) { + if (binaryTree_isRightChild(j)) + r += C[binaryTree_left(binaryTree_parent(j))]; + + j=binaryTree_parent(j); + } + return r; +} + + + + +void DynFMI::printDynFMIContent(std::ostream& stream){ + uchar c; + for (ulong i=1; i<=getSize(); i++) + { + c =(*this)[i]; + if (c==0) c= '#'; + stream << c; + } +} + + + + diff --git a/dynFMI.h b/dynFMI.h new file mode 100644 index 0000000..41ef992 --- /dev/null +++ b/dynFMI.h @@ -0,0 +1,233 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +// ------ Dynamic Sequence with Indels ----- +// uses huffman shaped wavelet tree +// space: O(nH(0)) operation time: O(log n log \sigma) +// papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text +// Indexes. CPM 2006, Chapter 3.6 + + +#include +#include +#include +#include +#include + +#ifndef SAMPLE +//#define SAMPLE 1024 +#define SAMPLE 0 // No sampling. +#endif + +#ifndef BUFFER +#define BUFFER 1048576 +#endif + + +#include "bittree.h" +#include "handle.h" +#include "pos.h" + +#if SAMPLE!=0 +#include "ssatree.h" +#endif + + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + + +const ulong sampleInterval = SAMPLE; + +class WaveletNode{ + public: + + WaveletNode *left; + WaveletNode *right; + WaveletNode *parent; + ulong weight; // used only while construction + uchar c0; // used also while construction + uchar c1; + + BVTree *bittree; + + WaveletNode(uchar c) + : left(0), right(0), parent(0), weight(0), c0(c), bittree(0){} + + WaveletNode(WaveletNode *left, WaveletNode *right) + : left(left), right(right), parent(0), bittree(0) { + weight = left->weight + right->weight; + left->parent = this; + right->parent = this; + } + + ~WaveletNode(){ + delete bittree; + bittree = 0; + } + + bool operator>(const WaveletNode &a) const { + return (weight > a.weight); + } + + +}; + +namespace std +{ + +template<> struct greater + { + bool operator()(WaveletNode const* p1, WaveletNode const* p2) + { + if(!p1) + return false; + if(!p2) + return true; + return *p1 > *p2; + } + }; +} + + +class DynFMI{ + public: + + // read char distribution from string + // x=expected number of texts, n=length of text, del=delete text afterwards + DynFMI(uchar *text, ulong x, ulong n, bool del); + // read char distribution from file + DynFMI(char *readfile); + + ~DynFMI(); + + + + //n=length of str, del=delete text afterwards + ulong addText(uchar const * str, ulong n); + ulong addTextFromFile(char* filename, ulong n); + uchar* retrieveText(ulong text); + bool deleteText(ulong key); + + ulong count(uchar *pattern); +#if SAMPLE!=0 + ulong* locate(uchar *pattern); +#endif + + //LF(i)-mapping: C[L[i]]+rank_L[i](L,i) + ulong LFmapping(ulong i) { + uchar s=(*this)[i]; + return (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,i); + } + + + ulong getSize() {return root->bittree->getPositions();} + ulong getCollectionSize(){ return pos->getSize();} + uchar* getBWT(); + std::ostream& getBWTStream(std::ostream& stream); + + uchar operator[](ulong i); + void printDynFMIContent(std::ostream& stream); + ulong* getKeys() {return handle->getKeys(); } + ulong getPos(ulong i) + { return handle->getPos(i); } + + private: + WaveletNode *root; + WaveletNode **leaves; // needed for construction and select + + ulong codes[256]; + int codelengths[256]; + ulong C[256+256]; + +#if SAMPLE!=0 + BVTree *SampledSAPositionsIndicator; +#endif + ulong iterate; + + Handle *handle; + Pos *pos; +#if SAMPLE!=0 + SampledSATree *sampledSATree; +#endif + + + ulong rank(uchar c, ulong i); + ulong select(uchar c, ulong i); + + void insert(uchar c, ulong i); + void append(uchar c); + void append(uchar *text, ulong length); + void deleteSymbol(ulong i); + + + + // functions + ulong getNumberOfSymbolsSmallerThan(uchar c); + + void initEmptyDynFMI(uchar *text, ulong x, ulong n); + + void makeCodes(ulong code, int bits, WaveletNode *node); + void deleteLeaves(WaveletNode *node); + void appendBVTrees(WaveletNode *node); + + void deleteDynFMINodes(WaveletNode *n); + + //Iterator (public??) + void iterateReset(); + bool iterateNext(); + uchar iterateGetSymbol(); + void recursiveIterateResetWaveletNode(WaveletNode *w); + + + // small help functions + double log2(double x){ + return (log10(x) / log10((double)2)); + } + + int binaryTree_parent(int i){ + return (int)floor((double)i/2); + } + + int binaryTree_left(int i){ + return 2*i; + } + + int binaryTree_right(int i){ + return 2*i + 1; + } + + bool binaryTree_isLeftChild(int i){ + return (i%2==(int)0); + } + + bool binaryTree_isRightChild(int i){ + return (i%2==(int)1); + } + +}; + + + + diff --git a/handle.cpp b/handle.cpp new file mode 100644 index 0000000..9568dd4 --- /dev/null +++ b/handle.cpp @@ -0,0 +1,288 @@ + +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +#include +#include + +#include "handle.h" + +using std::cerr; +using std::cout; +using std::endl; + +ulong Handle::getPos(ulong key){ + HandleNode *n = getKey(key); + if (n==0) return 0; + + PosNode *p = n->posNode; + + #ifndef NDEBUG + if (p==pos->getNil()) { + cerr << "error: Handle::getPos: no PosNode found." << endl; + exit(EXIT_FAILURE); + } + #endif + return (this->pos->getPos(p)); +} + +ulong* Handle::getEndPositions(){ + ulong num = getSize(); + ulong i = 0; + + if (this->root == getNil()) return 0; + + ulong *positions= new ulong[num]; + + + HandleNode *n = (HandleNode*) treeMinimum(root); + while (n != getNil()) { + + //printf("node: %d", n); + positions[i] = this->pos->getPos(n->posNode); + i++; + n = (HandleNode*) treeSuccessor((RBNode*) n); + } + return positions; +} + + +ulong* Handle::getKeys(){ + ulong num = getSize(); + ulong i = 0; + + if (this->root == getNil()) return 0; + + ulong *keys= new ulong[num]; + + + HandleNode *n = (HandleNode*) treeMinimum(root); + #ifndef NDEBUG + HandleNode *oldn; + #endif + while (n != getNil()) { + + //printf("node: %d", n); + keys[i] = n->key; + i++; + #ifndef NDEBUG + oldn = n; + #endif + n = (HandleNode*) treeSuccessor((RBNode*) n); + #ifndef NDEBUG + if (n == oldn) { + cerr << "old!" << endl; + exit(EXIT_FAILURE); + } + #endif + } + #ifndef NDEBUG + if (num != i) { + cerr << "num: " << num << endl; + cerr << "i: " << i << endl; + cerr << "error: Handle::getKeys: key number was not correct." << endl; + exit(EXIT_FAILURE); + } + #endif + return keys; +} + +void Handle::deleteKey(ulong key){ + + HandleNode *n = getKey(key); + this->pos->deletePosNode(n->posNode); + + rbDelete( n, callHandleUpdateCountersOnPathToRoot); + delete n; +} + + + + +HandleNode* Handle::getKey(ulong key){ + HandleNode *n= getRoot(); + while (n != getNil()) { + if (n->key == key) return n; + if (n->getLeft() != getNil()) { + if (key <= n->getLeft()->maxkey) n=n->getLeft(); + else n=n->getRight(); + } + else n=n->getRight(); + } + + + return 0; +} + +void Handle::updateCountersOnPathToRoot(HandleNode *n) { + while (n != getNil()) { + updateCounters(n); + + n = n->getParent(); + } +} + + +void Handle::updateCounters(HandleNode *n) { + #ifndef NDEBUG + if (n == getNil()) { + cerr << "error: Handle::updateCounters" << endl; + exit(EXIT_FAILURE); + } + #endif + n->subtreeSize=1; + + + n->subtreeSize += n->getLeft()->subtreeSize; + + + if ( n->getRight() != getNil()) { + n->subtreeSize += n->getRight()->subtreeSize; + n->maxkey=n->getRight()->maxkey; + } else n->maxkey=n->key; + +} + + +HandleNode* Handle::getNewKey(){ + HandleNode *n= getRoot(); + + if (n == getNil()) { + //tree is empty + HandleNode *newLeaf= new HandleNode(getNil(),1); // 1=smallest key, 0 = error + setRoot(newLeaf); + ((RBNode*)newLeaf)->color=BLACK; + + return newLeaf; + } + + if (n->maxkey == n->subtreeSize) { + // tree is full + HandleNode *last = (HandleNode*) treeMaximum(n); + + + HandleNode *newLeaf= new HandleNode(getNil(), n->maxkey+1); + + last->setRight(newLeaf); + newLeaf->setParent(last); + + if (newLeaf->getParent() != getNil()) updateCountersOnPathToRoot(newLeaf->getParent()); + + rbInsertFixup(newLeaf, callHandleUpdateCounters); + + return newLeaf; + } + + HandleNode *newNode; + ulong smallerkeys = 0; + ulong lmax; //getLeft()->maxkey + ulong lsub; //n->getLeft()->subtreeSize + while (true) { + cout << "search first free key" << endl; + // search first free key + + lmax = n->getLeft()->maxkey; + lsub = n->getLeft()->subtreeSize; + + + if (lmax == 0) { // no left child + if (smallerkeys+1 < n->key) { // check if it is free + newNode= new HandleNode(getNil(), smallerkeys+1); + newNode->setParent(n); + n->setLeft(newNode); + updateCountersOnPathToRoot(n); + rbInsertFixup(newNode, callHandleUpdateCounters); + return newNode; + } + } else { //left child exists + if ( lmax > (lsub + smallerkeys) ) { + // free key at left subtree + n=n->getLeft(); + continue; + } else if (lmax + 1 < n->key) { // full left subtree, check if it is free inbetween + // insert to predecessor + n=(HandleNode*)treePredecessor(n); + //found place -> :predeccessor new key = lmax + 1 + newNode= new HandleNode(getNil(), lmax+1); + newNode->setParent(n); + n->setRight(newNode); + updateCountersOnPathToRoot(n); + rbInsertFixup(newNode, callHandleUpdateCounters); + return newNode; + + } + } + + smallerkeys += 1+lsub; + + if (n->getRight() == getNil()) { // no right child, must be free + newNode= new HandleNode(getNil(), smallerkeys+1); + newNode->setParent(n); + n->setRight(newNode); + updateCountersOnPathToRoot(n); + rbInsertFixup(newNode, callHandleUpdateCounters); + return newNode; + } else { //right child exists + + //n=n->getRight(); + if (n->getRight()->maxkey-smallerkeys == n->getRight()->subtreeSize) { // right subTree is full, insert in front + ulong leftMinKey = n->getRight()->maxkey - n->getRight()->subtreeSize; + if (leftMinKey -1 > n->key) { //in front there is space + //insert new n-key + 1 + newNode= new HandleNode(getNil(), n->key+1); + n=(HandleNode*)treeSuccessor(n); + newNode->setParent(n); + n->setLeft(newNode); + updateCountersOnPathToRoot(n); + rbInsertFixup(newNode, callHandleUpdateCounters); + return newNode; + } else { + cerr << "error: Handle::getNewKey: no space ?!? " << endl; + exit(EXIT_FAILURE); + } + } else { + n=n->getRight(); + } + + + } + + #ifndef NDEBUG + if (n==getNil()) { + cerr << "error: Handle::getNewKey: (A) something wrong ! " << endl; + exit(EXIT_FAILURE); + } + #endif + } + #ifndef NDEBUG + cerr << "error: Handle::getNewKey: (B) something wrong ! " << endl; + exit(EXIT_FAILURE); + #endif + return 0; // error +} + +void callHandleUpdateCounters(RBNode *n, RBTree *T){ + ((Handle *)T)->updateCounters((HandleNode*) n); +} + +void callHandleUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){ + ((Handle *)T)->updateCountersOnPathToRoot((HandleNode*) n); +} + diff --git a/handle.h b/handle.h new file mode 100644 index 0000000..dbe2fc4 --- /dev/null +++ b/handle.h @@ -0,0 +1,134 @@ + +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + +#ifndef Handle +#define Handle Handle + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + +#include "rbtree.h" +#include "pos.h" + + +class Pos; +class PosNode; + +void callHandleUpdateCounters(RBNode *n, RBTree *T); +void callHandleUpdateCountersOnPathToRoot(RBNode *n, RBTree *T); + + +class HandleNode : public RBNode { + public: + ulong key; //maximum for internal nodes + ulong subtreeSize; + ulong maxkey; + PosNode *posNode; + + HandleNode() + : RBNode(this), key(0), subtreeSize(0), maxkey(0){ + } + + + HandleNode(HandleNode *n, ulong key) + : RBNode(n), key(key), subtreeSize(1), maxkey(key){ + } + + + HandleNode* getParent(){ + return ((HandleNode*) ((RBNode*) this)->parent); + } + + HandleNode* getLeft(){ + return ((HandleNode*) ((RBNode*) this)->left); + } + + HandleNode* getRight(){ + return ((HandleNode*) ((RBNode*) this)->right); + } + + void setParent(HandleNode* n){ + ((RBNode*) this)->parent=(RBNode*)n; + } + + void setLeft(HandleNode* n){ + ((RBNode*) this)->left=(RBNode*)n; + } + + void setRight(HandleNode* n){ + ((RBNode*) this)->right=(RBNode*)n; + } + +}; + +class Handle : public RBTree{ + public: + Pos *pos; + + Handle(){ + setNil(new HandleNode()); + setRoot(getNil()); + } + + + void setRoot(HandleNode* n){ + ((RBTree*) this)->root=(RBNode*)n; + } + + HandleNode* getRoot(){ + return ((HandleNode*) ((RBTree*) this)->root); + } + + void setNil(HandleNode* n){ + ((RBTree*) this)->nil=(RBNode*)n; + } + + HandleNode* getNil(){ + return ((HandleNode*) ((RBTree*) this)->nil); + } + + + ulong getSize(){ + return (getRoot()!=getNil())?getRoot()->subtreeSize:0; + } + + ulong* getKeys(); + + ulong getPos(ulong key); + ulong* getEndPositions(); + + HandleNode* getKey(ulong key); + void deleteKey(ulong key); + void updateCountersOnPathToRoot(HandleNode *n); + void updateCounters(HandleNode *n); + + HandleNode* getNewKey(); + + + +}; + +#endif diff --git a/makefile b/makefile new file mode 100644 index 0000000..0d90342 --- /dev/null +++ b/makefile @@ -0,0 +1,15 @@ +CC = g++ +CPPFLAGS = -Wall -ansi -pedantic -g + +testTextCollection_obs = testTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o handle.o pos.o rbtree.o dynFMI.o + +testTextCollection: $(testTextCollection_obs) + $(CC) -o testTextCollection $(testTextCollection_obs) + +clean: + rm -f core *.o *~ testTextCollection + +depend: + $(CC) -MM *.cpp *.c > dependencies.mk + +include dependencies.mk diff --git a/pos.cpp b/pos.cpp new file mode 100644 index 0000000..45f2b15 --- /dev/null +++ b/pos.cpp @@ -0,0 +1,170 @@ + +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modquity * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +#include +#include + +#include "pos.h" + +using std::cerr; +using std::endl; + +ulong Pos::getPos(PosNode *n){ + ulong position=0; + + if (n->getLeft() != getNil()) position += n->getLeft()->subtreeSize; + + while (n->getParent() != getNil()) { + if (n == n->getParent()->getRight()) { + // rightchild + if (n->getParent()->getLeft() != getNil()) position += n->getParent()->getLeft()->subtreeSize; + position++; + } + + n=n->getParent(); + } + + + return position+1; +} + +ulong Pos::getTextSize(ulong pos){ + PosNode *n= getPosNode(pos); + + return n->textSize; +} + +void Pos::deleteText(ulong pos){ + PosNode *n= getPosNode(pos); + + // current node matches. + rbDelete(n, callPosUpdateCountersOnPathToRoot); + delete n; +} + +void Pos::deletePosNode(PosNode *n){ + rbDelete(n, callPosUpdateCountersOnPathToRoot); + delete n; +} + +PosNode* Pos::getPosNode(ulong pos){ + // smallest pos is 1 ! + pos--; + PosNode *n= getRoot(); + ulong leftTree=0; + ulong leftSubTreeSize; + while (true) { + leftSubTreeSize = n->getLeft()->subtreeSize; + + if (pos == leftTree + leftSubTreeSize ) { + // current node matches. + return n; + } else if (pos < leftTree + leftSubTreeSize){ + n=n->getLeft(); + } + else { + leftTree += leftSubTreeSize +1; + n=n->getRight(); + } + } + + + cerr << "error: Pos::getPosNode: text POS " << pos << " not found!" << endl; + exit(EXIT_FAILURE); + + return getNil(); +} + + +ulong Pos::appendText(ulong textSize){ + PosNode *n= getRoot(); + + if (n == getNil()) { + PosNode *newLeaf= new PosNode(getNil(), textSize); + + setRoot(newLeaf); + ((RBNode*)newLeaf)->color = BLACK; + + HandleNode* hn = handle->getNewKey(); + + // connect handleNode and posNode: + hn->posNode = newLeaf; + newLeaf->handleNode = hn; + + return hn->key; + } + + while (n->getRight() != getNil()) { + n=n->getRight(); + } + + // new right child ! + PosNode *newLeaf= new PosNode(getNil(), textSize); + + + n->setRight(newLeaf); + newLeaf->setParent(n); + + updateCountersOnPathToRoot(newLeaf); + + rbInsertFixup(newLeaf, callPosUpdateCounters); + + HandleNode* hn = handle->getNewKey(); + + // connect handleNode and posNode: + hn->posNode = newLeaf; + newLeaf->handleNode = hn; + + return hn->key; +} + + + + +void Pos::updateCountersOnPathToRoot(PosNode *n) { + while (n != getNil()) { + updateCounters(n); + + n = n->getParent(); + } +} + +void Pos::updateCounters(PosNode *n) { + n->subtreeSize=1; + //n->sumTextSize = n->textSize; + + n->subtreeSize += n->getLeft()->subtreeSize; + //n->sumTextSize += n->getLeft()->sumTextSize; + + n->subtreeSize += n->getRight()->subtreeSize; + //n->sumTextSize += n->getRight()->sumTextSize; +} + + + +void callPosUpdateCounters(RBNode *n, RBTree *T){ + ((Pos *)T)->updateCounters((PosNode*) n); +} + +void callPosUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){ + ((Pos *)T)->updateCountersOnPathToRoot((PosNode*) n); +} + diff --git a/pos.h b/pos.h new file mode 100644 index 0000000..4fcdbb7 --- /dev/null +++ b/pos.h @@ -0,0 +1,139 @@ + +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + + //TODO check star color of the nodes pos and handle! + +#ifndef Pos +#define Pos Pos + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + +#include "rbtree.h" +#include "handle.h" + +class Handle; +class HandleNode; + +void callPosUpdateCounters(RBNode *n, RBTree *T); +void callPosUpdateCountersOnPathToRoot(RBNode *n, RBTree *T); + +class PosNode : public RBNode { + public: + + HandleNode *handleNode; + ulong subtreeSize; + ulong textSize; // size including endmarker! + //ulong sumTextSize; // sum of textlength of all texts located in this subtree; + + PosNode() // NIL + : RBNode(this),subtreeSize(0),textSize(0) { + } + + PosNode(PosNode *n, ulong textSize) + : RBNode(n),subtreeSize(1),textSize(textSize) { + } + + + PosNode* getParent(){ + return ((PosNode*) ((RBNode*) this)->parent); + } + + PosNode* getLeft(){ + return ((PosNode*) ((RBNode*) this)->left); + } + + PosNode* getRight(){ + return ((PosNode*) ((RBNode*) this)->right); + } + + void setParent(PosNode* n){ + ((RBNode*) this)->parent=(RBNode*)n; + } + + void setLeft(PosNode* n){ + ((RBNode*) this)->left=(RBNode*)n; + } + + void setRight(PosNode* n){ + ((RBNode*) this)->right=(RBNode*)n; + } +}; + +class Pos: public RBTree +{ + public: + + Handle* handle; + + ulong textNumber; + ulong matchPosition; + + ulong sampleInterval; + + Pos(ulong sampleInterval){ + setNil(new PosNode()); + setRoot(getNil()); + this->sampleInterval=sampleInterval; + } + + void setRoot(PosNode* n){ + ((RBTree*) this)->root=(RBNode*)n; + } + + PosNode* getRoot(){ + return ((PosNode*) ((RBTree*) this)->root); + } + + void setNil(PosNode* n){ + ((RBTree*) this)->nil=(RBNode*)n; + } + + PosNode* getNil(){ + return ((PosNode*) ((RBTree*) this)->nil); + } + + ulong getSize(){ + return (getRoot()!=getNil())?getRoot()->subtreeSize:0; + } + + + + ulong getPos(PosNode *n); + + + PosNode* getPosNode(ulong text); + ulong getTextSize(ulong pos); + void deleteText(ulong pos); + void deletePosNode(PosNode *n); + ulong appendText(ulong textSize); + + void updateCountersOnPathToRoot(PosNode *n); + void updateCounters(PosNode *n); + +}; + +#endif diff --git a/rbtree.cpp b/rbtree.cpp new file mode 100644 index 0000000..09326c1 --- /dev/null +++ b/rbtree.cpp @@ -0,0 +1,454 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + +#include +#include +#include "rbtree.h" + + +using std::cout; +using std::endl; +using std::cerr; + + +void RBTree::checkTree(){ + cout << "start" << endl; + this->root->countBlack(0); + cout << "stop" << endl; +} + +void RBTree::leftRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode* y = x->right; + x->right=y->left; + if (y->left != nil) y->left->parent=x; + y->parent=x->parent; + if (x->parent==this->nil) + root=y; + else { + if (x == x->parent->left) + x->parent->left=y; + else + x->parent->right=y; + } + y->left=x; + x->parent=y; + + // update counters of x and y ! + if (updateNode != 0) { + updateNode(x, this); + updateNode(y, this); + } +} + +void RBTree::rightRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode* y = x->left; + x->left=y->right; + if (y->right != nil) y->right->parent=x; + y->parent=x->parent; + + + if (x->parent==nil) + root=y; + else { + if (x == x->parent->right) { + x->parent->right=y; + } else { + x->parent->left=y; + } + } + + y->right=x; + x->parent=y; + if (updateNode != 0) { + updateNode(x, this); + updateNode(y, this); + } +} + +void RBTree::rbInsertFixup(RBNode* z, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode *y; + + if (z->parent==nil) return; + while (z->parent->color == RED) { + if (z->parent == z->parent->parent->left) + { + y = z->parent->parent->right; + + if (y->color==RED) + { + z->parent->color=BLACK; // Case 1a + y->color=BLACK; // Case 1a + z->parent->parent->color=RED; // Case 1a + z=z->parent->parent; // Case 1a + } + else + { + if (z==z->parent->right) + { + z=z->parent; // Case 2a + leftRotate(z, updateNode); // Case 2a + } + z->parent->color=BLACK; // Case 3a + z->parent->parent->color=RED; // Case 3a + rightRotate(z->parent->parent, updateNode); // Case 3a + } + } + else + { + + y = z->parent->parent->left; + + if (y->color==RED) + { + z->parent->color=BLACK; // Case 1b + y->color=BLACK; // Case 1b + z->parent->parent->color=RED; // Case 1b + z=z->parent->parent; // Case 1b + } + else + { + if (z==z->parent->left) + { + z=z->parent; // Case 2b + rightRotate(z, updateNode); // Case 2b + } + z->parent->color=BLACK; // Case 3b + z->parent->parent->color=RED; // Case 3b + leftRotate(z->parent->parent, updateNode); // Case 3b + } + } + if (z->parent==nil) return; + } //end of while + + root->color=BLACK; +} + + +void RBTree::rbDeleteFixup(RBNode *x, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode *w; + + while ((x != root) && (x->color == BLACK)) + { + if (x == x->parent->left) + { + w=x->parent->right; + + if (w->color == RED) + { + w->color=BLACK; // Case 1a + x->parent->color=RED; // Case 1a + leftRotate(x->parent, updateNode); // Case 1a + w=x->parent->right; // Case 1a + } + if ((w->left->color == BLACK) && (w->right->color == BLACK)) + { + w->color=RED; // Case 2a + x=x->parent; // Case 2a + } + else + { + if (w->right->color == BLACK) + { + w->left->color=BLACK; // Case 3a + w->color=RED; // Case 3a + rightRotate(w, updateNode); // Case 3a + w=x->parent->right; // Case 3a + } + w->color=x->parent->color; // Case 4a + x->parent->color=BLACK; // Case 4a + w->right->color=BLACK; // Case 4a + leftRotate(x->parent, updateNode); // Case 4a + x=root; // Case 4a + } + } + else + { + w=x->parent->left; + + if (w->color == RED) + { + w->color=BLACK; // Case 1b + x->parent->color=RED; // Case 1b + rightRotate(x->parent, updateNode); // Case 1b + w=x->parent->left; // Case 1b + } + if ((w->right->color == BLACK) && (w->left->color == BLACK)) + { + w->color=RED; // Case 2b + x=x->parent; // Case 2b + } + else + { + if (w->left->color == BLACK) + { + w->right->color=BLACK; // Case 3b + w->color=RED; // Case 3b + leftRotate(w, updateNode); // Case 3b + w=x->parent->left; // Case 3b + } + + w->color=x->parent->color; // Case 4b + x->parent->color=BLACK; // Case 4b + w->left->color=BLACK; // Case 4b + rightRotate(x->parent, updateNode); // Case 4b + + x=root; // Case 4b + } + } + } // end of while + x->color=BLACK; +} + +// quite similar to Cormen et al +void RBTree::rbDelete(RBNode *z, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode *y,*x,*y_oldParent; + y_oldParent=nil; + if (z->left == nil || z->right == nil) { + y = z; //z has no or one child, deletion is easy + } else { + y = treeSuccessor(z); + y_oldParent=y->parent; + } + + if (y->left != nil) { + x=y->left; + } else { + x=y->right; + } + + x->parent=y->parent; + + // cut out y: + if (y->parent == nil) { + root=x; + } else { + if (isLeftChild(y)) {y->parent->left = x;} + else {y->parent->right = x;} + } + + RBNodecolor old_y = y->color; + if (y != z) { // 2 children + //move y to z's old position and delete z + if (root==z) { + root=y; + } else { + if (isLeftChild(z)) z->parent->left = y; + else z->parent->right = y; + } + + y->parent = z->parent; + y->left = z->left; + y->right = z->right; + y->color = z->color; // don't forget to delete z after rbDelete + + y->left->parent = y; + y->right->parent = y; + + } + + + if (old_y == BLACK) { + rbDeleteFixup(x, updateNode); + } + + updateNode(y, this); + if (y_oldParent!=nil) updateNode(y_oldParent, this); + +} + + +RBNode* RBTree::treeSuccessor(RBNode *x){ + if (x->right != nil) return treeMinimum(x->right); + + RBNode *y = x->parent; + while ((y!=nil) && (x == y->right)) { + x=y; + y=y->parent; + } + return y; +} + +RBNode* RBTree::treePredecessor(RBNode *x){ + if (x->left != nil) return treeMaximum(x->left); + + RBNode *y = x->parent; + while ((y!=nil) && (x == y->left)) { + x=y; + y=y->parent; + } + return y; +} + + +RBNode* RBTree::treeMinimum(RBNode *x){ + while (x->left != nil) x=x->left; + return x; +} + +RBNode* RBTree::treeMaximum(RBNode *x){ + while (x->right != nil) x=x->right; + return x; +} + + +bool RBTree::isLeftChild(RBNode *n){ + #ifndef NDEBUG + if (n->parent == nil) { + cerr << "error: isLeftChild, no parent." << endl; + exit(EXIT_FAILURE); + } + #endif + return (n->parent->left == n); +} + +bool RBTree::isRightChild(RBNode *n){ + #ifndef NDEBUG + if ( n->parent == nil) { + cerr << "error: isLeftChild, no parent." << endl; + exit(EXIT_FAILURE); + } + #endif + return (n->parent->right == n); +} + + +RBNode* RBTree::findRightSiblingLeaf(RBNode *n){ + // go up: + while (true) { + if (n->parent!=nil) { + if (n==n->parent->right) + n=n->parent; + else + break; + } else + return nil; + } + + n=n->parent; + + // leftmost leaf in n is the right sibling searched + n=n->right; + + // go down: + while (n->left != nil) { + n=n->left; + } + + return n; +} + +RBNode* RBTree::findLeftSiblingLeaf(RBNode *n){ + // go up: + while (true) { + if (n->parent!=nil) { + if (n==n->parent->left) + n=n->parent; + else + break; + } else + return nil; + } + + n=n->parent; + + // rightmost leaf in n is the left sibling searched + n=n->left; + + // go down: + while (n->right != nil) { + n=n->right; + } + + return n; +} + + +int RBTree::getNodeMaxDepth(RBNode *n) { + int l; + int r; + if (n->left == nil) l=0; + else l=getNodeMaxDepth(n->left); + if (n->right == nil) r=0; + else r=getNodeMaxDepth(n->right); + + return (1+(l>r?l:r)); +} + +int RBTree::getNodeMinDepth(RBNode *n) { + int l; + int r; + if (n->left == 0) l=0; + else l=getNodeMinDepth(n->left); + if (n->right == 0) r=0; + else r=getNodeMinDepth(n->right); + + return (1+(l>r?r:l)); +} + + +void RBTree::printSubTree(RBNode *n){ + cout << ((n->color==BLACK)?"B":"R") << n << " ["; + if (n->left==nil) cout << "N"; + else printSubTree(n->left); + cout << ","; + if (n->right==nil) cout << "N"; + else printSubTree(n->right); + cout << "]"; +} + + +void RBTree::checkSubTree(RBNode *n){ + + if (n->left!=nil) { + if (n->left->parent != n) { + cout << "au"<< endl; + exit(1); + } + checkSubTree(n->left); + } + + if (n->right!=nil) { + if (n->right->parent != n) { + cout << "au"<< endl; + exit(1); + } + checkSubTree(n->right); + } + +} + + +void RBTree::checkNode(RBNode *n){ + if (n->left!=nil) { + if (n->left->parent != n) { + cout << "au"<< endl; + exit(1); + } + + } + + if (n->right!=nil) { + if (n->right->parent != n) { + cout << "au"<< endl; + exit(1); + } + } +} diff --git a/rbtree.h b/rbtree.h new file mode 100644 index 0000000..b59c0a3 --- /dev/null +++ b/rbtree.h @@ -0,0 +1,116 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + +// this is a red-black tree implementation by wolfgang Gerlach based on the algorithm provided by +// Cormen et al.: Introduction to Algorithms, Second Edition. MIT Press and McGraw-Hill, 2001 + + + +#ifndef RBTree +#define RBTree RBTree + + + + +enum RBNodecolor{BLACK,RED}; + + +// generic Red-Black Tree Node: +class RBNode +{ + public: + RBNode* parent; + RBNode* left; + RBNode* right; // 3*4 bytes + + enum RBNodecolor color; // due to structure alignment: 4 bytes !!! + + RBNode(){}; + + RBNode(RBNode *n) + : parent(n), left(n), right(n){ + color=RED; + } + + + virtual ~RBNode(){} //adds 4 bytes vtable + + void countBlack(int i){ + if (this->color == BLACK) i++; + if (this->left != this) this->left->countBlack(i); + else std::cout << i << ","; + if (this->right != this) this->right->countBlack(i); + else std::cout << i << ","; + } +}; + +class RBTree{ + public: + + RBNode *root; + RBNode *nil; + + + virtual ~RBTree(){ + // Don't double free! + if (root != this->nil) + deleteNode(root); + root = 0; + delete this->nil; + this->nil = 0; + } + + void checkTree(); + + void rbInsertFixup(RBNode* z, void (*updateNode)(RBNode* n, RBTree *T)); + void rbDeleteFixup(RBNode *x, void (*updateNode)(RBNode* n, RBTree *T)); + void rbDelete(RBNode *z, void (*updateNode)(RBNode* n, RBTree *T)); + RBNode* findRightSiblingLeaf(RBNode *n); + RBNode* findLeftSiblingLeaf(RBNode *n); + RBNode* treeSuccessor(RBNode *x); + RBNode* treePredecessor(RBNode *x); + RBNode* treeMinimum(RBNode *x); + RBNode* treeMaximum(RBNode *x); + + bool isLeftChild(RBNode *n); + bool isRightChild(RBNode *n); + + int getNodeMaxDepth(RBNode *n); + int getNodeMinDepth(RBNode *n); + + void printSubTree(RBNode *n); + void checkSubTree(RBNode *n); + void checkNode(RBNode *x); + + void deleteNode(RBNode* x){ + if (x->left!=nil && x->left) deleteNode(x->left); + if (x->right!=nil && x->right) deleteNode(x->right); + delete x; + } + + + private: + void leftRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)); + void rightRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)); + +}; + +#endif diff --git a/testTextCollection.cpp b/testTextCollection.cpp new file mode 100644 index 0000000..d005b8f --- /dev/null +++ b/testTextCollection.cpp @@ -0,0 +1,69 @@ +// Test driver for text collection +#include +using std::cout; +using std::endl; + +#include "TextCollection.h" +using SXSI::TextCollection; + +void printDocumentResult(TextCollection::document_result dr) +{ + TextCollection::document_result::iterator it; + printf("document result:"); + for (it = dr.begin(); it != dr.end(); ++it) + printf(" %i", *it); + printf("\n"); + fflush(stdout); +} + +void printFullResult(TextCollection::full_result fr) +{ + TextCollection::full_result::iterator it; + printf("full result:"); + for (it = fr.begin(); it != fr.end(); ++it) + printf(" (%i, %lu)", (*it).first, (*it).second); + printf("\n"); + fflush(stdout); +} + +int main() +{ + uchar *text = (uchar*) "acabab"; + TextCollection *csa = TextCollection::InitTextCollection(1); + csa->InsertText(text); + text = (uchar*) "abaca"; + csa->InsertText(text); + text = (uchar*) "abacb"; + csa->InsertText(text); + + csa->MakeStatic(); + + text = csa->GetText(0); + cout << "Text 0: \"" << text << "\"" << endl; + delete [] text; + text = csa->GetText(1); + cout << "Text 1: \"" << text << "\"" << endl; + delete [] text; + text = csa->GetText(2); + cout << "Text 2: \"" << text << "\"" << endl; + delete [] text; + + text = csa->GetText(2, 2, 4); + cout << "Substring of Text 3: \"" << text << "\"" << endl; + delete [] text; + + printf("n:o contains: %u\n", csa->CountContains((uchar *)"ac")); + printf("n:o suffix: %u\n", csa->CountSuffix((uchar *)"b")); + printf("n:o equal: %u\n", csa->CountEqual((uchar *)"acabab")); + printf("is equal: %u\n", csa->IsEqual((uchar *)"abacb")); + + TextCollection::document_result dr; + dr = csa->Contains((uchar*)"ab"); + printDocumentResult(dr); + + TextCollection::full_result fr; + fr = csa->FullContains((uchar *)"ab"); + printFullResult(fr); + + delete csa; +} -- 2.17.1