From: kim Date: Mon, 24 Nov 2008 23:32:08 +0000 (+0000) Subject: Initial Import X-Git-Url: http://git.nguyen.vg/gitweb/?a=commitdiff_plain;h=7d27a4450ed429e3b63e9d3ba7217a28cbbf9a31;p=SXSI%2FTextCollection.git Initial Import git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@6 3cdefd35-fc62-479d-8e8d-bae585ffb9ca --- 7d27a4450ed429e3b63e9d3ba7217a28cbbf9a31 diff --git a/BitRank.cpp b/BitRank.cpp new file mode 100644 index 0000000..61dcc42 --- /dev/null +++ b/BitRank.cpp @@ -0,0 +1,300 @@ +#include "BitRank.h" + +/***************************************************************************** + * Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved. * + * * + * New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations. * + * * + * This library is free software; you can redistribute it and/or * + * modify it under the terms of the GNU Lesser General Public * + * License as published by the Free Software Foundation; either * + * version 2.1 of the License, or (at your option) any later version. * + * * + * This library is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * + * Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public * + * License along with this library; if not, write to the Free Software * + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * + ****************************************************************************/ + +// Modified by Niko Välimäki + +///////////// +//Rank(B,i)// +///////////// +//This Class use a superblock size of 256-512 bits +//and a block size of 32-64 bits also + + +const unsigned char __popcount_tab[] = +{ +0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7, +3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8, +}; + +const unsigned char select_tab[] = +{ +0,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +8,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, + +6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1, +}; + + +// bits needed to represent a number between 0 and n +inline ulong bits (ulong n){ + ulong b = 0; + while (n) { b++; n >>= 1; } + return b; +} + +#if W == 32 + // 32 bit version + inline unsigned popcount (register ulong x){ + return __popcount_tab[(x >> 0) & 0xff] + __popcount_tab[(x >> 8) & 0xff] + __popcount_tab[(x >> 16) & 0xff] + __popcount_tab[(x >> 24) & 0xff]; + } +#else + // 64 bit version + inline unsigned popcount (register ulong x){ + return __popcount_tab[(x >> 0) & 0xff] + __popcount_tab[(x >> 8) & 0xff] + __popcount_tab[(x >> 16) & 0xff] + __popcount_tab[(x >> 24) & 0xff] + __popcount_tab[(x >> 32) & 0xff] + __popcount_tab[(x >> 40) & 0xff] + __popcount_tab[(x >> 48) & 0xff] + __popcount_tab[(x >> 56) & 0xff]; + } +#endif + +inline unsigned popcount16 (register int x){ + return __popcount_tab[x & 0xff] + __popcount_tab[(x >> 8) & 0xff]; +} + +inline unsigned popcount8 (register int x){ + return __popcount_tab[x & 0xff]; +} + +BitRank::BitRank(ulong *bitarray, ulong n, bool owner) { + data=bitarray; + this->owner = owner; + this->n=n; // length of bitarray in bits + b = W; // b is a word + s=b*superFactor; + ulong aux=(n+1)%W; + if (aux != 0) + integers = (n+1)/W+1; + else + integers = (n+1)/W; + BuildRank(); +} + +BitRank::~BitRank() { + delete [] Rs; + delete [] Rb; + if (owner) delete [] data; +} + +//Build the rank (blocks and superblocks) +void BitRank::BuildRank() +{ + ulong num_sblock = n/s; + ulong num_block = n/b; + Rs = new ulong[num_sblock+1];//+1 we add the 0 pos + Rb = new uchar[num_block+1];//+1 we add the 0 pos + + ulong j; + Rs[0] = 0lu; + + for (j=1;j<=num_sblock;j++) + { + Rs[j]=BuildRankSub((j-1)*superFactor,superFactor)+Rs[j-1]; + } + + Rb[0]=0; + for (ulong k=1;k<=num_block;k++) { + j = k / superFactor; + Rb[k]=BuildRankSub(j*superFactor, k%superFactor); + } +} + +ulong BitRank::BuildRankSub(ulong ini, ulong bloques){ + ulong rank=0,aux; + + + for(ulong i=ini;i>8]+Rb[i>>wordShift] + +popcount(data[i >> wordShift] & ((1lu << (i & Wminusone))-1)); +} + +ulong BitRank::select(ulong x) { + // returns i such that x=rank(i) && rank(i-1) integers) + return n; + + j = data[left]; + + ones = popcount(j); + } + //sequential search using popcount over a char + left=left*b; + rankmid = popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + } + } + } + + // then sequential search bit a bit + while (x>0) { + if (j&1lu) x--; + j=j>>1; + left++; + } + return left-1; +} + +ulong BitRank::select0(ulong x) { + // returns i such that x=rank0(i) && rank0(i-1) integers) + return n; + + j = data[left]; + zeros = W - popcount(j); + } + + //sequential search using popcount over a char + left=left*b; + rankmid = 8 - popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = 8 - popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + rankmid = 8 - popcount8(j); + if (rankmid < x) { + j=j>>8; + x-=rankmid; + left+=8; + } + } + } + + // then sequential search bit a bit + while (x>0) { + if (!(j&1lu)) x--; + j=j>>1; + left++; + } + return left - 1; +} + + +bool BitRank::IsBitSet(ulong i) { + return (1lu << (i % W)) & data[i/W]; +} + +ulong BitRank::NumberOfBits() { + return n; +} diff --git a/BitRank.h b/BitRank.h new file mode 100644 index 0000000..c7783d2 --- /dev/null +++ b/BitRank.h @@ -0,0 +1,56 @@ +/***************************************************************************** + * Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved. * + * * + * New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations. * + * * + * This library is free software; you can redistribute it and/or * + * modify it under the terms of the GNU Lesser General Public * + * License as published by the Free Software Foundation; either * + * version 2.1 of the License, or (at your option) any later version. * + * * + * This library is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU * + * Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public * + * License along with this library; if not, write to the Free Software * + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA * + ****************************************************************************/ + +#ifndef _BITSTREAM_H_ +#define _BITSTREAM_H_ + +#include "Tools.h" + +class BitRank { +private: + // Check word length +#if W == 32 + static const unsigned wordShift = 5; + static const unsigned superFactor = 8; // 256 bit blocks +#else + static const unsigned wordShift = 6; + static const unsigned superFactor = 4; // 256 bit blocks +#endif + + ulong *data; //here is the bit-array + bool owner; + ulong n,integers; + unsigned b,s; + ulong *Rs; //superblock array + uchar *Rb; //block array + ulong BuildRankSub(ulong, ulong); //internal use of BuildRank + void BuildRank(); //crea indice para rank +public: + BitRank(ulong *, ulong, bool); + ~BitRank(); //destructor + ulong rank(ulong i); //Rank from 0 to n-1 + ulong select(ulong x); // gives the position of the x:th 1. + ulong select0(ulong x); // gives the position of the x:th 0. + + bool IsBitSet(ulong i); + ulong NumberOfBits(); +}; + +#endif diff --git a/CSA.cpp b/CSA.cpp new file mode 100644 index 0000000..a15c733 --- /dev/null +++ b/CSA.cpp @@ -0,0 +1,976 @@ +/****************************************************************************** + * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki * + * * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * + *****************************************************************************/ +#include "CSA.h" +#include "TextCollection.h" +#include +#include +#include +#include +#include +#include +#include +#include // For strlen() +using std::vector; +using std::pair; +using std::make_pair; +using std::map; + +using SXSI::TextCollection; + + +//////////////////////////////////////////////////////////////////////////// +// Class CSA::THuffAlphabetRank + +CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) { + left = NULL; + right = NULL; + bitrank = NULL; + ch = s[0]; + leaf = false; + this->codetable = codetable; + + bool *B = new bool[n]; + TextPosition sum=0,i; + /* + for (i=0; i< n; i++) { + printf("%c:", (char)((int)s[i]-128)); + for (r=0;r 100) return;*/ + for (i=0;i 0) + sfirst = new uchar[n-sum]; + //if (sum > 0) + ssecond = new uchar[sum]; + unsigned j=0,k=0; + for (i=0;i 0) { + left = new THuffAlphabetRank(sfirst,j,codetable,level+1); + delete [] sfirst; + //} + //if (k>0) { + right = new THuffAlphabetRank(ssecond,k,codetable,level+1); + delete [] ssecond; + //} +} + + +bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) { + // testing that the code works correctly + int C[256]; + unsigned i,j; + bool correct=true; + for (j=0;j<256;j++) + C[j] = 0; + for (i=0;i%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i)); + } + } + return correct; +} + +CSA::THuffAlphabetRank::~THuffAlphabetRank() { + if (left!=NULL) delete left; + if (right!=NULL) delete right; + if (bitrank!=NULL) + delete bitrank; +} + + +//////////////////////////////////////////////////////////////////////////// +// Class CSA + +/** + * Constructor inits an empty dynamic FM-index. + * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE. + */ +CSA::CSA(unsigned samplerate) + : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0), + codetable(0), dynFMI(0) +{ + this->samplerate = samplerate; + + // FIXME TODO : DynFMI needs distribution of characters before hand + // This will create fully balanced wavelet tree for all chars [0, 255]. + uchar temp[256]; + for (unsigned i = 0; i < 255; ++i) + temp[i] = i+1; + temp[255] = 0; + dynFMI = new DynFMI(temp, 1, 255, false); +} + +/** + * Insert new text + * + * Given text must include end-marker. + * Text identifiers are numbered starting from 0. + */ +void CSA::InsertText(uchar const * text) +{ + // Sanity check: + assert(dynFMI != 0); + + TextPosition m = std::strlen((char *)text) + 1; + // Store text length and starting position + textLength.push_back(make_pair(m, n)); + this->n += m; + dynFMI->addText(text, m); +} + +void CSA::MakeStatic() +{ + // Sanity check: + if (dynFMI == 0) + { + std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl; + std::exit(0); + } + + uchar *bwt = dynFMI->getBWT(); + /*printf("1234567890123456789\n"); + for (TextPosition i = 0; i < n; i ++) + if (bwt[i] < 50) + printf("%d", (int)bwt[i]); + else + printf("%c", bwt[i]); + printf("\n"); + */ + +/* for (TextPosition i = 1; i <= n; i ++) + { + printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i)); + }*/ + + // Sanity check + assert(textLength.size() == dynFMI->getCollectionSize()); + + delete dynFMI; + dynFMI = 0; + + ulong i, min = 0, + max; + for (i=0;i<256;i++) + C[i]=0; + for (i=0;i0) {min = i; break;} + for (i=255;i>=min;--i) + if (C[i]>0) {max = i; break;} + + // Print frequencies +/* for(i = 0; i < 256; i++) + if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]); + fflush(stdout);*/ + + ulong prev=C[0], temp; + C[0]=0; + for (i=1;i<256;i++) { + temp = C[i]; + C[i]=C[i-1]+prev; + prev = temp; + } + this->codetable = node::makecodetable(bwt,n); + alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0); + //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n"); + +/* for (i = 0; i < n; ++i) + { + uchar c = alphabetrank->charAtPos(i); + TextPosition j = C[c]+alphabetrank->rank(c, i)-1; + printf("LF[%lu] = %lu\n", i, j); + }*/ + + // Calculate BWT end-marker position (of last inserted text) + i = 0; + while (bwt[i] != '\0') + { + uchar c = alphabetrank->charAtPos(i); + i = C[c]+alphabetrank->rank(c, i)-1; + } + bwtEndPos = i; + //printf("bwtEndPos = %lu\n", bwtEndPos); + + delete [] bwt; + + // Make sampling tables + maketables(); + // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small + // |class CSA| +256*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+... + //printf("FMindex takes %d B\n", + // 6*W/8+256*3*W/8+256*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8); +} + + +uchar* CSA::GetText(DocId k) const +{ + assert(k < (DocId)textLength.size()); + + uchar* result = new uchar[textLength[k].first]; + TextPosition i = k; + TextPosition j = textLength[k].first-1; + result[j] = '\0'; + + uchar c = alphabetrank->charAtPos(i); + while (c != '\0') + { + --j; + result[j] = c; + i = C[c]+alphabetrank->rank(c,i)-1; + + c = alphabetrank->charAtPos(i); // "next" char. + } + assert(j == 0); + return result; +} + +uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const +{ + assert(k < (DocId)textLength.size()); + assert(j < textLength[k].first); + assert(i <= j); + + // Start position of k'th text + ulong start = textLength[k].second; + + return Substring(i + start, j-i+1); +} + +/** + * Existential queries + */ +bool CSA::IsPrefix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Check for end-marker(s) in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + if (it != endmarkers.end() && it->first <= ep) + return true; + return false; +} + +bool CSA::IsSuffix(uchar const *pattern) const +{ + // Here counting is as fast as isSuffix(): + if (CountSuffix(pattern) > 0) + return true; + return false; +} + +bool CSA::IsEqual(uchar const *pattern) const +{ + TextPosition m = std::strlen((char *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep); + + // Check for end-marker(s) in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + if (it != endmarkers.end() && it->first <= ep) + return true; + return false; +} + +bool CSA::IsContains(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return true; + + TextPosition sp = 0, ep = 0; + // Just check if pattern exists somewhere + ulong count = Search(pattern, m, &sp, &ep); + + if (count > 0) + return true; + return false; +} + +bool CSA::IsLessThan(uchar const*) const +{ + // TODO + return false; +} + +/** + * Counting queries + */ +unsigned CSA::CountPrefix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + // Count end-markers in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + unsigned count = 0; + while (it != endmarkers.end() && it->first <= ep) + { + ++ count; + ++ it; + } + + return count; +} + +unsigned CSA::CountSuffix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + // Search with end-marker + unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep); + + return count; +} + +unsigned CSA::CountEqual(uchar const *pattern) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return textLength.size(); + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep); + + // Count end-markers in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + unsigned count = 0; + while (it != endmarkers.end() && it->first <= ep) + { + ++ count; + ++ it; + } + + return count; +} + +unsigned CSA::CountContains(uchar const * pattern) const +{ + // Here counting is as slow as fetching the result set + // because we would have to filter out occ's that fall within same document. + TextCollection::document_result result = Contains(pattern); + return result.size(); +} + +unsigned CSA::CountLessThan(uchar const * pattern) const +{ + // TODO + return 0; +} + +/** + * Document reporting queries + */ +TextCollection::document_result CSA::Prefix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + Search(pattern, m, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + unsigned count = 0; + while (it != endmarkers.end() && it->first <= ep) + { + // End-marker we found belongs to the "previous" doc in the collection + DocId docId = ((it->second).first + 1) % textLength.size(); + result.push_back(docId); + ++ count; + ++ it; + } + + return result; +} + +TextCollection::document_result CSA::Suffix(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Search with end-marker + Search(pattern, m+1, &sp, &ep); + + TextCollection::document_result result; + + // Check each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->charAtPos(i); + while (c != '\0' && !sampled->IsBitSet(i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + c = alphabetrank->charAtPos(i); + } + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(i))->second; + // End-marker that we found belongs to the "previous" doc in collection: + DocId docId = (endm.first + 1) % textLength.size(); + result.push_back(docId); + } + else + { + TextPosition textPos = suffixes[sampled->rank(i)-1]; + result.push_back(DocIdAtTextPos(textPos)); + } + } + + return result; +} + +TextCollection::document_result CSA::Equal(uchar const *pattern) const +{ + TextPosition m = strlen((char const *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Match including end-marker + Search(pattern, m+1, &sp, &ep); + + TextCollection::document_result result; + + // Report end-markers in result interval + map >::const_iterator it; + it = endmarkers.lower_bound(sp); + unsigned count = 0; + while (it != endmarkers.end() && it->first <= ep) + { + // End-marker that we found belongs to the "previous" doc in collection: + DocId docId = ((it->second).first + 1) % textLength.size(); + result.push_back(docId); + ++ count; + ++ it; + } + + return result; +} + + +TextCollection::document_result CSA::Contains(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return TextCollection::document_result(); + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + // We want unique document indentifiers, using std::set to collect them + std::set resultSet; + + // Check each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + uchar c = alphabetrank->charAtPos(i); + while (c != '\0' && !sampled->IsBitSet(i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + c = alphabetrank->charAtPos(i); + } + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(i))->second; + // End-marker we found belongs to the "previous" doc in collection + DocId docId = (endm.first + 1) % textLength.size(); + resultSet.insert(docId); + } + else + { + TextPosition textPos = suffixes[sampled->rank(i)-1]; + resultSet.insert(DocIdAtTextPos(textPos)); + } + } + + // Convert std::set to std::vector (TODO better way to construct result vector?) + TextCollection::document_result result; + for (std::set::iterator it = resultSet.begin(); it != resultSet.end(); ++it) + result.push_back(*it); + return result; +} + +TextCollection::document_result CSA::LessThan(uchar const * pattern) const +{ + // TODO + return document_result(); +} + +/** + * Full result set queries + */ +TextCollection::full_result CSA::FullContains(uchar const * pattern) const +{ + TextPosition m = strlen((char *)pattern); + if (m == 0) + return full_result(); + + TextPosition sp = 0, ep = 0; + // Search all occurrences + Search(pattern, m, &sp, &ep); + + full_result result; + + // Report each occurrence + for (; sp <= ep; ++sp) + { + TextPosition i = sp; + TextPosition dist = 0; + uchar c = alphabetrank->charAtPos(i); + while (c != '\0' && !sampled->IsBitSet(i)) + { + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + c = alphabetrank->charAtPos(i); + ++ dist; + } + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(i))->second; + // End-marker we found belongs to the "previous" doc in the collection: + DocId docId = (endm.first+1)%textLength.size(); + result.push_back(make_pair(docId, dist)); + } + else + { + TextPosition textPos = suffixes[sampled->rank(i)-1] + dist; + // Read document identifier and its starting position in text order + DocId docId = DocIdAtTextPos(textPos); + result.push_back(make_pair(docId, textPos - textLength[docId].second)); + } + } + + return result; +} + + +/** + * Finds document identifier for given text position + * + * Starting text position of the document is stored into second parameter. + * Binary searching on text starting positions. + */ +TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const +{ + assert(i < n); + + DocId a = 0; + DocId b = textLength.size() - 1; + while (a < b) + { + DocId c = a + (b - a)/2; + if (textLength[c].second > i) + b = c - 1; + else if (textLength[c+1].second > i) + return c; + else + a = c + 1; + } + + assert(a < (DocId)textLength.size()); + assert(i > textLength[a].second); + assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second)); + return a; +} + +void CSA::Load(FILE *filename, unsigned samplerate) +{ + // TODO + +} + +void CSA::Save(FILE *filename) +{ + // TODO +} + + +/** + * Rest of the functions follow... + */ + +uchar * CSA::Substring(TextPosition i, TextPosition l) const +{ + uchar *result = new uchar[l + 1]; + if (l == 0) + { + result[0] = 0u; + return result; + } + + TextPosition dist; + TextPosition k = i + l - 1; + // Check for end of the string + if (k > n - 1) + { + l -= k - n + 1; + k = n - 1; + } + + TextPosition skip = samplerate - k % samplerate - 1; + TextPosition j; + if (k / samplerate + 1 >= n / samplerate) + { + j = bwtEndPos; + skip = n - k - 1; + } + else + { + j = positions[k/samplerate+1]; + //cout << samplerate << ' ' << j << '\n'; + } + + for (dist = 0; dist < skip + l; dist++) + { + int c = alphabetrank->charAtPos(j); + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(j))->second; + j = endm.first; // LF-mapping for end-marker + } + else + j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping + if (dist >= skip) + result[l + skip - dist - 1] = c; + } + result[l] = 0u; + return result; +} + + /*TextPosition CSA::inverse(TextPosition i) +{ + TextPosition skip = samplerate - i % samplerate; + TextPosition j; + if (i / samplerate + 1 >= n / samplerate) + { + j = bwtEndPos; + skip = n - i; + } + else + { + j = positions[i/samplerate+1]; + //cout << samplerate << ' ' << j << '\n'; + } + + while (skip > 0) + { + int c = alphabetrank->charAtPos(j); + j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping + skip --; + } + return j; + }*/ + +ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const { + // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i) + int c = (int)pattern[m-1]; + TextPosition i=m-1; + TextPosition sp = C[c]; + TextPosition ep = C[c+1]-1; + while (sp<=ep && i>=1) + { +// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep); + c = (int)pattern[--i]; + sp = C[c]+alphabetrank->rank(c,sp-1); + ep = C[c]+alphabetrank->rank(c,ep)-1; + } + *spResult = sp; + *epResult = ep; + if (sp<=ep) + return ep - sp + 1; + else + return 0; + } + + /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep) +{ + sp = C[(int)c]+alphabetrank->rank(c,sp-1); + ep = C[(int)c]+alphabetrank->rank(c,ep)-1; + + if (sp<=ep) + return ep - sp + 1; + else + return 0; + }*/ + +CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma) +{ + TextPosition dist=0; + while (!sampled->IsBitSet(i)) + { + int c = alphabetrank->charAtPos(i); + if (c == '\0') + { + // map::operator[] is not const, using find() instead: + pair endm = (endmarkers.find(i))->second; + return endm.second + dist; // returns text position. + } + i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping + ++dist; + } + + return suffixes[sampled->rank(i)-1]+dist; +} + +CSA::~CSA() { + delete alphabetrank; + delete sampled; + delete [] suffixes; + delete [] positions; + delete [] codetable; +} + + +void CSA::maketables() +{ + ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1; + + ulong *sampledpositions = new ulong[n/W+1]; + suffixes = new ulong[sampleLength]; + positions = new ulong[sampleLength]; + + ulong i,j=0; + for (i=0;iGetPos(i) + x=(i==n-1)?0:i+1; + + if (x % samplerate == 0) { + Tools::SetField(sampledpositions,1,p,1); + positions[x/samplerate] = p; + } + + uchar c = alphabetrank->charAtPos(p); + if (c == '\0') + { + // Record the order of end-markers in BWT: + --textId; + endmarkers[p] = make_pair(textId, (TextPosition)x); + // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis). + p = textId; // Correct LF-mapping to the last char of the previous text. + } + else + p = C[c]+alphabetrank->rank(c, p)-1; + } + assert(textId == 0); + /*for (map >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it) + { + printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second); + }*/ + + sampled = new BitRank(sampledpositions,n,true); + + //suffixes: + for(i=0; irank(positions[i]); + if (j==0) j=sampleLength; + suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate; + } +} + +uchar * CSA::LoadFromFile(const char *filename) +{ + uchar *s; + std::ifstream file (filename, std::ios::in|std::ios::binary); + if (file.is_open()) + { + std::cerr << "Loading CSA from file: " << filename << std::endl; + file.read((char *)&bwtEndPos, sizeof(TextPosition)); + s = new uchar[n]; + for (ulong offset = 0; offset < n; offset ++) + file.read((char *)(s + offset), sizeof(char)); + file.close(); + } + else + { + std::cerr << "Unable to open file " << filename << std::endl; + exit(1); + } + return s; +} + +void CSA::SaveToFile(const char *filename, uchar *bwt) +{ + std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc); + if (file.is_open()) + { + std::cerr << "Writing CSA to file: " << filename << std::endl; + file.write((char *)&bwtEndPos, sizeof(TextPosition)); + std::cerr << "Writing BWT of " << n << " bytes." << std::endl; + for (ulong offset = 0; offset < n; offset ++) + file.write((char *)(bwt + offset), sizeof(char)); + file.close(); + } + else + { + std::cerr << "Unable to open file " << filename << std::endl; + exit(1); + } +} + +/*uchar * CSA::BWT(uchar *text) +{ + uchar *s; + + DynFMI *wt = new DynFMI((uchar *) text, n); + s = wt->getBWT(); + for (ulong i=0;i, std::greater > q; +// +// First I push all the leaf nodes into the queue +// + for ( unsigned int i = 0 ; i < 256 ; i++ ) + if ( result[ i ].count ) + q.push(node( i, result[ i ].count ) ); +// +// This loop removes the two smallest nodes from the +// queue. It creates a new internal node that has +// those two nodes as children. The new internal node +// is then inserted into the priority queue. When there +// is only one node in the priority queue, the tree +// is complete. +// + + while ( q.size() > 1 ) { + node *child0 = new node( q.top() ); + q.pop(); + node *child1 = new node( q.top() ); + q.pop(); + q.push( node( child0, child1 ) ); + } +// +// Now I compute and return the codetable +// + q.top().maketable(0u,0u, result); + q.pop(); + return result; +} + + +void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const +{ + if ( child0 ) + { + child0->maketable( SetBit(code,bits,0), bits+1, codetable ); + child1->maketable( SetBit(code,bits,1), bits+1, codetable ); + delete child0; + delete child1; + } + else + { + codetable[value].code = code; + codetable[value].bits = bits; + } +} + +void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts ) +{ + ulong i; + for (i = 0 ; i < 256 ; i++ ) + counts[ i ].count = 0; + for (i=0; i +#include +#include "BitRank.h" +#include "dynFMI.h" +#include "TextCollection.h" + +/** + * Implementation of the TextCollection interface + * + * Implementation notes: + * + * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree + * which requires O(n log \sigma) bits of memory. If we know the distribution + * of characters beforehand, space can easily be made O(nH_0). + * + * The method MakeStatic() constructs a Succinct Suffix Array with a huffman + * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME) + * + * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used + * with any FM-index variant supporting LF-mapping. + */ +class CSA : public SXSI::TextCollection { +public: + /** + * Constructor with (default) samplerate + */ + explicit CSA(unsigned); + ~CSA(); + /** + * Following functions implement the interface described in TextCollection.h. + * For details/documentation, see TextCollection.h. + */ + void InsertText(uchar const *); + void MakeStatic(); + uchar* GetText(DocId) const; + uchar* GetText(DocId, TextPosition, TextPosition) const; + + bool IsPrefix(uchar const *) const; + bool IsSuffix(uchar const *) const; + bool IsEqual(uchar const *) const; + bool IsContains(uchar const *) const; + bool IsLessThan(uchar const *) const; + + unsigned CountPrefix(uchar const *) const; + unsigned CountSuffix(uchar const *) const; + unsigned CountEqual(uchar const *) const; + unsigned CountContains(uchar const *) const; + unsigned CountLessThan(const unsigned char*) const; + + // document_result is inherited from SXSI::TextCollection. + document_result Prefix(uchar const *) const; + document_result Suffix(uchar const *) const; + document_result Equal(uchar const *) const; + document_result Contains(uchar const *) const; + document_result LessThan(uchar const *) const; + + // full_result is inherited from SXSI::TextCollection. + full_result FullContains(uchar const *) const; + + // TODO implement: + void Load(FILE *, unsigned); + void Save(FILE *); + +private: + class TCodeEntry { + public: + unsigned count; + unsigned bits; + unsigned code; + TCodeEntry() {count=0;bits=0;code=0u;}; + }; + + + class THuffAlphabetRank { + // using fixed 0...255 alphabet + private: + BitRank *bitrank; + THuffAlphabetRank *left; + THuffAlphabetRank *right; + TCodeEntry *codetable; + uchar ch; + bool leaf; + public: + THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned); + ~THuffAlphabetRank(); + bool Test(uchar *, TextPosition); + + inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i + THuffAlphabetRank const * temp=this; + if (codetable[c].count == 0) return 0; + unsigned level = 0; + unsigned code = codetable[c].code; + while (!temp->leaf) { + if ((code & (1u<bitrank->rank(i); + temp = temp->left; + } + else { + i = temp->bitrank->rank(i)-1; + temp = temp->right; + } + ++level; + } + return i+1; + }; + inline bool IsCharAtPos(int c, TextPosition i) const { + THuffAlphabetRank const * temp=this; + if (codetable[c].count == 0) return false; + unsigned level = 0; + unsigned code = codetable[c].code; + while (!temp->leaf) { + if ((code & (1u<bitrank->IsBitSet(i)) return false; + i = i-temp->bitrank->rank(i); + temp = temp->left; + } + else { + if (!temp->bitrank->IsBitSet(i)) return false; + i = temp->bitrank->rank(i)-1; + temp = temp->right; + } + ++level; + } + return true; + } + inline int charAtPos(TextPosition i) const { + THuffAlphabetRank const * temp=this; + while (!temp->leaf) { + if (temp->bitrank->IsBitSet(i)) { + i = temp->bitrank->rank(i)-1; + temp = temp->right; + } + else { + i = i-temp->bitrank->rank(i); + temp = temp->left; + } + } + return (int)temp->ch; + } + }; + + class node { + private: + unsigned weight; + uchar value; + node *child0; + node *child1; + + void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const; + static void count_chars(uchar *, TextPosition , TCodeEntry *); + static unsigned SetBit(unsigned , unsigned , unsigned ); + public: + node( unsigned char c = 0, unsigned i = 0 ) { + value = c; + weight = i; + child0 = 0; + child1 = 0; + } + + node( node* c0, node *c1 ) { + value = 0; + weight = c0->weight + c1->weight; + child0 = c0; + child1 = c1; + } + + + bool operator>( const node &a ) const { + return weight > a.weight; + } + + static TCodeEntry *makecodetable(uchar *, TextPosition); + }; + + static const unsigned char print = 1; + static const unsigned char report = 1; + TextPosition n; + unsigned samplerate; + ulong C[256]; + TextPosition bwtEndPos; + THuffAlphabetRank *alphabetrank; + BitRank *sampled; + ulong *suffixes; + ulong *positions; + TCodeEntry *codetable; + DynFMI *dynFMI; + // Map from end-marker in BWT to pair (textId, sampled text position) + std::map > endmarkers; + // Vector of pairs of + std::vector > textLength; + + // Private methods + uchar * BWT(uchar *); + uchar * LoadFromFile(const char *); + void SaveToFile(const char *, uchar *); + void maketables(); + + // Following are not part of the public API + DocId DocIdAtTextPos(TextPosition) const; + ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; + TextPosition Lookup(TextPosition) const; + TextPosition Inverse(TextPosition) const; + TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; + TextPosition Psi(TextPosition) const; + uchar * Substring(TextPosition, TextPosition) const; +}; + +#endif diff --git a/TextCollection.cpp b/TextCollection.cpp new file mode 100644 index 0000000..0300899 --- /dev/null +++ b/TextCollection.cpp @@ -0,0 +1,16 @@ +#include "TextCollection.h" +#include "CSA.h" + +namespace SXSI +{ + /** + * Init text collection + * + * See CSA.h for more details. + */ + TextCollection * TextCollection::InitTextCollection(unsigned samplerate) + { + TextCollection *result = new CSA(samplerate); + return result; + } +} diff --git a/TextCollection.h b/TextCollection.h new file mode 100644 index 0000000..e7e36e2 --- /dev/null +++ b/TextCollection.h @@ -0,0 +1,168 @@ +/****************************************************************************** + * Copyright (C) 2008 by Niko Valimaki * + * Text collection interface for an in-memory XQuery/XPath engine * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU Lesser General Public License as published * + * by the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU Lesser General Public License for more details. * + * * + * You should have received a copy of the GNU Lesser General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ******************************************************************************/ + +#ifndef _SXSI_TextCollection_h_ +#define _SXSI_TextCollection_h_ + +#include "Tools.h" // Defines ulong and uchar. +#include +#include // Defines std::pair. + +// Default samplerate for suffix array samples +#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64 + +namespace SXSI +{ + /** + * General interface for a text collection + * + * Class is virtual, make objects by calling + * the static method InitTextCollection(). + */ + class TextCollection + { + public: + // Type of document identifier + typedef int DocId; + // Type for text position (FIXME ulong or long?) + typedef ulong TextPosition; + + /** + * Init an instance of a text collection object + * + * Returns a pointer to an object implementing this interface. + */ + static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE); + /** + * Load from a file + * + * New samplerate can be given, otherwise will use the one specified in the save file! + * Note: This is not a static method; call InitTextCollection() first to get the object handle. + * + * Throws an exception if something goes wrong (unlikely since we are passing a file descriptor). + * + */ + virtual void Load(FILE *, unsigned samplerate = 0) = 0; + /** + * Save data structure into a file + */ + virtual void Save(FILE *) = 0; + /** + * Virtual destructor + */ + virtual ~TextCollection() { }; + /** + * Insert text + * + * Must be a zero-terminated string from alphabet [1,255]. + * Can not be called after makeStatic(). + * The i'th text insertion gets an identifier value i-1. + * In other words, document identifiers start from 0. + */ + virtual void InsertText(uchar const *) = 0; + /** + * Make static + * + * Convert to a static collection; reduces space and time complexities. + * New texts can not be inserted after this operation. + */ + virtual void MakeStatic() = 0; + + /** + * Displaying content + * + * Returns the i'th text in the collection. + * The numbering starts from 0. + */ + virtual uchar* GetText(DocId) const = 0; + /** + * Returns substring [i, j] of k'th text + * + * Note: Parameters i and j are text positions inside the k'th text. + */ + virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0; + /** + * Returns backwards (reverse) iterator to the end of i'th text + * + * Note: Do we need this? + * Forward iterator would be really in-efficient compared to + * getText(k) and getText(k, i, j). + * + * TODO Define and implement const_reverse_iterator. + */ + //const_reverse_iterator rend(DocId) const; + + /** + * Existential queries + */ + // Is there a text prefixed by given string? + virtual bool IsPrefix(uchar const *) const = 0; + // Is there a text having given string as a suffix? + virtual bool IsSuffix(uchar const *) const = 0; + // Is there a text that equals given string? + virtual bool IsEqual(uchar const *) const = 0; + // Does a text contain given string? + virtual bool IsContains(uchar const *) const = 0; + // Is there a text that is lexicographically less than given string? + virtual bool IsLessThan(uchar const *) const = 0; + + /** + * Counting queries + * + * Result is the number of documents. + */ + virtual unsigned CountPrefix(uchar const *) const = 0; + virtual unsigned CountSuffix(uchar const *) const = 0; + virtual unsigned CountEqual(uchar const *) const = 0; + virtual unsigned CountContains(uchar const *) const = 0; + virtual unsigned CountLessThan(uchar const *) const = 0; + + /** + * Document reporting queries + * + * Result is a vector of document id's in some undefined order. + */ + // Data type for results + typedef std::vector document_result; + virtual document_result Prefix(uchar const *) const = 0; + virtual document_result Suffix(uchar const *) const = 0; + virtual document_result Equal(uchar const *) const = 0; + virtual document_result Contains(uchar const *) const = 0; + virtual document_result LessThan(uchar const *) const = 0; + + /** + * Full reporting queries + * + * Result is a vector of pairs in some undefined order. + */ + // Data type for results + typedef std::vector > full_result; + virtual full_result FullContains(uchar const *) const = 0; + + protected: + // Protected constructor; call the static function InitTextCollection(). + TextCollection() { }; + + // No copy constructor or assignment + TextCollection(TextCollection const&); + TextCollection& operator = (TextCollection const&); + }; +} +#endif diff --git a/Tools.cpp b/Tools.cpp new file mode 100644 index 0000000..d7260d9 --- /dev/null +++ b/Tools.cpp @@ -0,0 +1,95 @@ +/* + * Collection of basic tools and defines + */ + +#include "Tools.h" +#include + + +time_t Tools::startTime; + +void Tools::StartTimer() +{ + startTime = time(NULL); +} + +double Tools::GetTime() +{ + time_t stopTime = time(NULL); + return difftime( stopTime, startTime ); +} + +uchar * Tools::GetRandomString(unsigned min, unsigned max, unsigned &alphabetSize) +{ + unsigned len = std::rand() % (max - min) + min; + alphabetSize = std::rand() % 26 + 1; + uchar* temp = new uchar[len + 2]; + for (unsigned i = 0; i < len; i++) + temp[i] = 97 + std::rand() % alphabetSize; + temp[len] = 0u ;temp[len+1] = '\0'; + return temp; +} + + +void Tools::PrintBitSequence(ulong *A, ulong len) +{ + for(ulong i = 0; i < len; i++) + if (GetField(A, 1, i)) + std::cout << "1"; + else + std::cout << "0"; + std::cout << "\n"; +} + +unsigned Tools::FloorLog2(ulong i) +{ + unsigned b = 0; + if (i == 0) + return 0; + while (i) + { + b++; + i >>= 1; + } + return b - 1; +} + +unsigned Tools::CeilLog2(ulong i) +{ + unsigned j = FloorLog2(i); + if ((ulong)(1lu << j) != i) + return j + 1; + + return j; +} + +uchar * Tools::GetFileContents(char *filename, ulong maxSize) +{ + std::ifstream::pos_type posSize; + std::ifstream file ((char *)filename, std::ios::in|std::ios::binary|std::ios::ate); + if (file.is_open()) + { + posSize = file.tellg(); + ulong size = posSize; + if (maxSize != 0 && size > maxSize) + size = maxSize; + char *memblock = new char [size + 1]; + file.seekg (0, std::ios::beg); + file.read (memblock, size); + memblock[size] = '\0'; + file.close(); + return (uchar *)memblock; + } + else + return 0; +} + +unsigned Tools::bits (ulong n) + + { unsigned b = 0; + while (n) + { b++; n >>= 1; } + return b; + } + + diff --git a/Tools.h b/Tools.h new file mode 100644 index 0000000..121e46e --- /dev/null +++ b/Tools.h @@ -0,0 +1,110 @@ +/* + * Collection of basic tools and defines + */ + + +#ifndef _TOOLS_H_ +#define _TOOLS_H_ + +#include +#include +#include +#include + +// Generates an error if __WORDSIZE is not defined +#ifndef __WORDSIZE +#error Missing definition of __WORDSIZE; Please define __WORDSIZE in Tools.h! +#endif + +// Check word length on GNU C/C++: +// __WORDSIZE should be defined in , which is #included from +#if __WORDSIZE == 64 +# define W 64 +#else +# define W 32 +#endif + +#define WW (W*2) +#define Wminusone (W-1) + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + + +class Tools +{ +private: + static time_t startTime; +public: + static void StartTimer(); + static double GetTime(); + static uchar * GetRandomString(unsigned, unsigned, unsigned &); + static void PrintBitSequence(ulong *, ulong); + static uchar * GetFileContents(char *, ulong =0); + static ulong * bp2bitstream(uchar *); + static unsigned FloorLog2(ulong); + static unsigned CeilLog2(ulong); + static unsigned bits (ulong); + + static inline void SetField(ulong *A, register unsigned len, register ulong index, register ulong x) + { + ulong i = index * len / W, + j = index * len - i * W; + ulong mask = (j+len < W ? ~0lu << j+len : 0) + | (W-j < W ? ~0lu >> (W-j) : 0); + A[i] = (A[i] & mask) | x << j; + if (j + len > W) + { + mask = ((~0lu) << (len + j - W)); + A[i+1] = (A[i+1] & mask)| x >> (W - j); + } + } + + static inline ulong GetField(ulong *A, register unsigned len, register ulong index) + { + register ulong i = index * len / W, + j = index * len - W * i, + result; + if (j + len <= W) + result = (A[i] << (W - j - len)) >> (W - len); + else + { + result = A[i] >> j; + result = result | (A[i+1] << (WW - j - len)) >> (W - len); + } + return result; + } + + + static inline ulong GetVariableField(ulong *A, register unsigned len, register ulong index) + { + register ulong i=index/W, j=index-W*i, result; + if (j+len <= W) + result = (A[i] << (W-j-len)) >> (W-len); + else { + result = A[i] >> j; + result = result | (A[i+1] << (WW-j-len)) >> (W-len); + } + return result; + } + + static inline void SetVariableField(ulong *A, register unsigned len, register ulong index, register ulong x) { + ulong i=index/W, + j=index-i*W; + ulong mask = (j+len < W ? ~0lu << j+len : 0) + | (W-j < W ? ~0lu >> (W-j) : 0); + A[i] = (A[i] & mask) | x << j; + if (j+len>W) { + mask = ((~0lu) << (len+j-W)); + A[i+1] = (A[i+1] & mask)| x >> (W-j); + } + } + + +}; + +#endif diff --git a/bittree.cpp b/bittree.cpp new file mode 100644 index 0000000..0003461 --- /dev/null +++ b/bittree.cpp @@ -0,0 +1,989 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + +#include "bittree.h" + + +using std::cout; +using std::endl; +using std::cerr; +using std::bitset; +using std::ostream; + +//std::ostream& operator<<(std::ostream& os, node& n) { ... } + + + BVTree::~BVTree(){ + delete tempbit; + tempbit = 0; + } + + + +/** +BVTree::BVTree(char *readfile){ + + std::ifstream file(readfile); + if (!file) + { + cout << "error reading file "<< readfile <<" !\n"; + exit(EXIT_FAILURE); + } + + + tempnil = (BVNode*) ((RBTree*) this)->nil; + tempbit = new bitset<2*logn>; + + + char c; + bitset<8> bs; + + while (file.get(c)) + { + bs = c; + + for (int i=7; i>=0; i--) + appendBit(bs[i]); + } + +} +**/ + + + + +void BVTree::iterateReset(){ + iterate=1; + iterateLocal=0; + if (root!=nil) { + iterateNode = (BVNode*) treeMinimum(root); + iterateRank = ((*iterateNode->block)[0])?1:0; + } else { + iterateNode=getNil(); + iterateRank = 0; + } +} + +bool BVTree::iterateGetBit(){ + return ((*iterateNode->block)[iterateLocal]); +} + +ulong BVTree::iterateGetRank(bool bit){ + if (bit) return iterateRank; + else return (iterate - iterateRank); +} + +bool BVTree::iterateNext(){ + iterate++; + + if (iterate > getPositions()) return false; + + + if (iterateLocal < iterateNode->myPositions-1) { + iterateLocal++; + } else { + // jump to next leaf; + iterateNode = (BVNode*) treeSuccessor(iterateNode); + #ifndef NDEBUG + if (iterateNode==getNil()) { + cout << "iterateNode==getNil()" << endl; + return false; + } + #endif + iterateLocal=0; + } + if ((*iterateNode->block)[iterateLocal]) iterateRank++; + + return true; +} + +ulong BVTree::getPositions(){ + + if (getRoot() == getNil()) return 0; + return getRoot()->subTreePositions; +} + +ulong BVTree::getRank(){ + + if (getRoot() == getNil()) return 0; + return getRoot()->subTreeRank; +} + + +void BVTree::printNode(BVNode *n){ + int commas=(2*logn)/10; + int commashift = -1; + + + cout << "address: " << n << endl; + + if (n->block != 0 && (n!=getNil())) + { + int size = 2*logn + 1 +commas; + + char *myblock = (char *) new char[size]; + if (myblock == 0) { + cerr << "error: printNode: myblock == 0" << endl; + exit(0); + } + + //read: i, write: i+commashift + for (int i=0; i<(int)(n->myPositions); i++){ + if (i%10 == 0) commashift++; + myblock[i+commashift]='0' + (*n->block)[i]; + if (i+commashift >= size-1) { + cerr << "X) printNode: array wrong index" << endl; + exit(0); + } + + } + + cout << "size=" << size << endl; + cout << "n->myPositions=" << n->myPositions << endl; + for (int i=n->myPositions; i<(2*logn); i++){ + if ((i%10) == 0) commashift++; + + myblock[i+commashift]='-'; + if (i+commashift > size-2) { + cerr << "A) printNode: array wrong index" << endl; + exit(0); + } + } + + commashift = 0; + for (int i=10; i < 2*logn; i++){ + if (i%10 == 0) + { + if (i+commashift >= size-2) { + cerr << "B) printNode: array wrong index" << endl; + exit(0); + } + + myblock[i+commashift]=','; + commashift++; + } + + } + + myblock[size - 1]='\0'; + + cout << "block: \"" << myblock << "\"" << endl; + delete myblock; + + } + else + cout << "block: none" << endl; + + + cout << "myPositions: " << n->myPositions << endl; + cout << "myRank: " << n->myRank << endl; + cout << "subTreePositions: " << n->subTreePositions << endl; + cout << "subTreeRank: " << n->subTreeRank << endl; + cout << "color: " << ((n->color==RED)?"RED":"BLACK") << endl; + cout << "parent: " << n->getParent() << endl; + cout << "left: " << n->getLeft() << endl; + cout << "right:" << n->getRight() << endl << endl; + +} + + +int BVTree::getTreeMaxDepth(){ + return getNodeMaxDepth(root); +} + +int BVTree::getTreeMinDepth(){ + return getNodeMinDepth(root); +} + + +void BVTree::updateCounters(BVNode *n){ + + if (n == getNil()) return; + + ulong lR = 0; + ulong lP = 0; + ulong rR = 0; + ulong rP = 0; + + if (n->getLeft() != getNil()) { + lR = n->getLeft()->subTreeRank; + lP = n->getLeft()->subTreePositions; + } + + if (n->getRight() != getNil()) { + rR = n->getRight()->subTreeRank; + rP = n->getRight()->subTreePositions; + } + + n->subTreeRank =lR + rR + n->myRank; + n->subTreePositions=lP + rP + n->myPositions; + +} + +ulong BVTree::getLocalRank(BVNode* n, ulong position){ + + #ifndef NDEBUG + if (position > n->myPositions) { + cerr << "error: getLocalRank: invalid position in block.\n"; + exit(EXIT_FAILURE); + } + #endif + + *tempbit =*(n->block)<<((2*logn)-position); + return tempbit->count(); + + // old version: + //rank = 0; + //for (ulong i = 0; i < position; i++) { + // if ((*n->block)[i]) rank++; + //} +} + +ulong BVTree::getLocalSelect1(BVNode* n, ulong query){ + ulong select =0; + #ifndef NDEBUG + if (query > n->myPositions) { + cerr << "error: getLocalSelect1: invalid position in block.\n"; + exit(EXIT_FAILURE); + } + #endif + ulong i; + for (i = 0; select < query; i++) { // TODO is there a faster solution similar to rank ? + if ((*n->block)[i]) select++; + } + + return i; +} + +ulong BVTree::getLocalSelect0(BVNode* n, ulong query){ + ulong select =0; + #ifndef NDEBUG + if (query > n->myPositions) { + cerr << "error: getLocalSelect0: invalid position in block.\n"; + exit(EXIT_FAILURE); + } + #endif + ulong i; + for (i = 0; select < query; i++) { + if (!(*n->block)[i]) select++; + } + + return i; +} + + +void BVTree::printNode(ulong i){ + BVNode* x = getRoot(); + + #ifndef NDEBUG + if (x == getNil()) { + cerr << "error: printNode(int i): root=NULL.\n"; + exit(EXIT_FAILURE); + + } + + if (i > x->myPositions) { + cerr << "error: printNode(int i): invalid position in block.\n"; + exit(EXIT_FAILURE); + } + #endif + + ulong lP=0; + bool search = true; + //find the corresponding block: + while (search) + { + + lP = x->getLeft()->subTreePositions; + + + if (lP >= i) + { + x=x->getLeft(); + } + else if (lP+x->myPositions >= i){ + i-=lP; + search = false; + } + else{ + i-=(lP+x->myPositions); + x=x->getRight(); + } + } + + + cout << "i=" << i << endl; + printNode(x); +} + + +bool BVTree::operator[](ulong i){ + + BVNode* x = getRoot(); + ulong lsP=0; + bool search = true; + //find the corresponding block: + while (search) + { + + lsP = x->getLeft()->subTreePositions; + + if (lsP >= i) + { + #ifndef NDEBUG + if (x->getLeft()==getNil()) { + cout << "lsP: " << lsP << endl; + printNode(x); + cout << "ihhh" << endl; + checkTree(); + exit(EXIT_FAILURE); + } + #endif + x=x->getLeft(); + } + else if (lsP+x->myPositions >= i){ + i-=lsP; + search = false; + } + else{ + i-=(lsP+x->myPositions); + #ifndef NDEBUG + if (x->getRight()==getNil()) { + cout << "i: " << i << endl; + cout << "lsP: " << lsP << endl; + cout << "x->myPositions: " << x->myPositions << endl; + printNode(x); + cout << "ihhh" << endl; + checkTree(); + exit(EXIT_FAILURE); + } + #endif + x=x->getRight(); + } + } + + return (*x->block)[i-1]; +} + + +ulong BVTree::rank1(ulong i){ + BVNode* x = getRoot(); + + if (i == this->getPositions() + 1) i--; + #ifndef NDEBUG + if (i > this->getPositions() ) { + cerr << "error: rank1(0): invalid position in bittree: " << i << endl; + cerr << "error: rank1(0): this->getPositions(): " << this->getPositions()<< endl; + exit(EXIT_FAILURE); + } + #endif + + + ulong lsP=0; + ulong lsR=0; + ulong rank=0; + bool search = true; + //find the corresponding block: + while (search) + { + + lsP = x->getLeft()->subTreePositions; + lsR = x->getLeft()->subTreeRank; + + if (lsP >= i) + {// cout << "L" << endl; + x=x->getLeft(); + } + else if (lsP+x->myPositions >= i){ + i-=lsP; + rank+=lsR; + search = false; + } + else{//cout << "R" << endl; + i-=(lsP+x->myPositions); + rank+=(lsR+x->myRank); + x=x->getRight(); + } + } + + rank+=getLocalRank(x, i); + return rank; +} + +ulong BVTree::rank0(ulong i){ + if (this->getPositions() == 0) return 0; + return (i-rank1(i)); +} + +ulong BVTree::select1(ulong i){ + BVNode* x = getRoot(); + ulong select=0; + #ifndef NDEBUG + if (i > x->subTreeRank ) { + cerr << "error: select1: invalid position in bittree: " << i << endl; + exit(EXIT_FAILURE); + } + #endif + + ulong lsP=0; + ulong lsR=0; + bool search = true; + //find the corresponding block: + while (search) + { + // update subTree-counters + + + lsP = x->getLeft()->subTreePositions; + lsR = x->getLeft()->subTreeRank; + + if (lsR >= i) { + x=x->getLeft(); + } + else if (lsR+x->myRank >= i) { + i-=lsR; + select+=lsP; + search = false; + } + else { + i-=(lsR+x->myRank); + select+=(lsP+x->myPositions); + x=x->getRight(); + } + } + select+=getLocalSelect1(x, i); + + + + return select; +} + +ulong BVTree::select0(ulong i){ + BVNode* x = getRoot(); + ulong select=0; + + #ifndef NDEBUG + if (i > (x->subTreePositions - x->subTreeRank) || i < 0) { + cerr << "error: select1: invalid position in bittree: " << i << endl; + exit(EXIT_FAILURE); + } + #endif + + + ulong lsP=0; + ulong lsR=0; + ulong lmR=0; + ulong lmP=0; + bool search = true; + //find the corresponding block: + while (search) + { + // update subTree-counters + + + lsP = x->getLeft()->subTreePositions; + lsR = x->getLeft()->subTreeRank; + lmR = x->getLeft()->myRank; + lmP = x->getLeft()->myPositions; + + if (lsP-lsR >= i) + { + x=x->getLeft(); + } + else if ((lsP-lsR)+(x->myPositions-x->myRank) >= i){ + i-=lsP-lsR; + select+=lsP; + search = false; + } + else{ + i-=((lsP-lsR)+(x->myPositions-x->myRank)); + select+=(lsP+x->myPositions); + x=x->getRight(); + } + } + + select+=getLocalSelect0(x, i); + + return select; +} + + +void BVTree::updateCountersOnPathToRoot(BVNode *walk){ + + while (walk != getNil()) { + updateCounters(walk); + walk=walk->getParent(); + } +} + +//deletes the BVNode and all its children, destroys reb-black-tree property! +void BVTree::deleteNode(BVNode *n){ + if (n==getNil() || n == 0) return; + + if (n->getLeft() != getNil() && n->getLeft()) deleteNode(n->getLeft()); + if (n->getRight() != getNil() && n->getRight()) deleteNode(n->getRight()); + + delete n; +} + + + + +// TODO improve by returning bitvalue + +void BVTree::deleteBit(ulong i){ + ulong old_i = i; + BVNode* x = getRoot(); + bool bit; + ulong rank=0; + + #ifndef NDEBUG + if (x == getNil()) { + cerr << "error: deleteBit, root is empty\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + + if (i > x->subTreePositions || i < 1) + { + cerr << "error: A, position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + #endif + + ulong lsP=0; + ulong lsR=0; + + bool search = true; + //find the corresponding block: + while (search) + { + // update of pointers is not yet possible: call updateCountersOnPathToRoot + + lsP = x->getLeft()->subTreePositions; + lsR = x->getLeft()->subTreeRank; + + + if (lsP >= i) + { + #ifndef NDEBUG + if (x->getLeft()==getNil()) exit(EXIT_FAILURE); + #endif + x=x->getLeft(); + } + else if (lsP+x->myPositions >= i){ + i-=lsP; + rank+=lsR; + search = false; + } + else{ + i-=(lsP+x->myPositions); + rank+=(lsR+x->myRank); // for speedup! + #ifndef NDEBUG + if (x->getRight()==getNil()) exit(EXIT_FAILURE); + #endif + x=x->getRight(); + } + } + + + + // now delete the bit from the block x: + bit =(*x->block)[i-1]; + + // store bit and rank information for speedup + lastBitDeleted=bit; + rank+=getLocalRank(x, i); + lastRank=(bit?rank:old_i-rank); + + #ifndef NDEBUG + if (i > x->myPositions) { + cerr << "error: B, position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + #endif + bitset<2*logn> mask; + + + if ( i > 1 ) + { + mask.set(); + mask>>=(2*logn - i + 1); + mask &= *(x->block); + (*(x->block)>>=i)<<=i-1; // shift bits by one + (*x->block)|=mask; // restore bits in front of deleted bit + } else { + *(x->block)>>=1; // shift bits by one + } + + x->myPositions--; + if (bit) x->myRank--; + + updateCountersOnPathToRoot(x); + + + if (x->myPositions == 0){ // if merging was not possible: + + rbDelete(x, callUpdateCountersOnPathToRoot); + delete x; + + } else if (x->myPositions <= logn/2) // merge (and rotate), if necessary: + { + + BVNode *sibling = (BVNode*) treeSuccessor(x); + //cout << "try to merge -----------------------------------------" << endl; + if (sibling != getNil()) + { + if (x->myPositions + sibling->myPositions < 2*logn) //merge ! + { + //cout << "merge with right sibling -----------------------------------------" << endl; + // move bits from sibling to x: + + (*sibling->block)<<=x->myPositions; + (*x->block)|=(*sibling->block); + + x->myPositions+=sibling->myPositions; + x->myRank+=sibling->myRank; + updateCountersOnPathToRoot(x); + rbDelete(sibling, callUpdateCountersOnPathToRoot); + delete sibling; + } + else sibling = getNil(); //to try the left sibling! + } + else if ((sibling= (BVNode*) treePredecessor(x)) != getNil()) + { + if (x->myPositions + sibling->myPositions < 2*logn) //merge ! + { + // move bits from sibling to x: (more complicated, needs shift!) + //cout << "merge with left sibling -----------------------------------------" << endl; + (*x->block)<<=sibling->myPositions; + x->myPositions+=sibling->myPositions; + + (*x->block)|=(*sibling->block); + + x->myRank+=sibling->myRank; + updateCountersOnPathToRoot(x); + rbDelete(sibling, callUpdateCountersOnPathToRoot); + delete sibling; + } + + + + } // end else if + } // end else if + + + +} + +void BVTree::writeTree(char *writefile){ + std::ofstream write(writefile); + if (!write) + { + cerr << "error: Error writing file: " << writefile<< "." << endl; + exit(EXIT_FAILURE); + } + writeTree(write); +} + + +ulong* BVTree::getBits(){ + BVNode *n = getRoot(); + ulong blockCounter=0; + ulong len = getPositions(); + + int W = sizeof(ulong)*8; + + + ulong bitsLength = len/W + ((len%W==0)?0:1); + + ulong *bits = new ulong[bitsLength]; + + ulong i=0; //0 .. bitsLength-1 + int j=0; //0 .. W-1 + + ulong mask_LeftBitSet = 1; + + mask_LeftBitSet <<= W-1; + + n=(BVNode*) treeMinimum(root); + while (n != getNil()) { + #ifndef NDEBUG + if (n->block == 0) { + cerr << "getBits(): block not found !!!" << endl; + exit(EXIT_FAILURE); + } + #endif + for (blockCounter=0; blockCounter < n->myPositions; blockCounter++) { + if (j == W) { + i++; + j=0; + } + bits[i] >>= 1; // set 0 + + if ((*n->block)[blockCounter]) bits[i] |= mask_LeftBitSet; // set 1 + j++; + } + n=(BVNode*) treeSuccessor(n); + } + + if (i != bitsLength-1) { + cout << "last index is wrong" << endl; + exit(EXIT_FAILURE); + } + while (j < W) { + bits[i] >>= 1; + j++; + } + + return bits; +} + +void BVTree::writeTree(ostream& stream){ + BVNode *x; + char c=0; + ulong cCounter=0; + ulong blockCounter=0; + + x = (BVNode*) treeMinimum(getRoot()); + + while (x != getNil()){ + blockCounter=x->myPositions; + while (blockCounter > 0) + { + while (blockCounter > 0 && cCounter < 8 ) + { + c<<=1; + if ((*x->block)[x->myPositions - blockCounter]) c++; + cCounter++; + blockCounter--; + } + if (cCounter == 8) + { + stream << c; + cCounter = 0; + } + } + + + x = (BVNode*) treeSuccessor(x); + } + + if (cCounter != 0) { + while (cCounter < 8 ) + { + c<<=1; // fill with zeros. + cCounter++; + } + stream << c; + } +} + + +void BVTree::appendBit(bool bit){ + ulong pos = 1; + if (root != getNil()) pos= getRoot()->subTreePositions + 1; + + insertBit(bit, pos); +} + +void BVTree::insertBit(bool bit, ulong i){ + if (getRoot() == getNil()) + { + BVNode *newNode; + newNode = new BVNode(getNil()); + newNode->color=BLACK; + bitset<2*logn> *newBlock; + newBlock=new bitset<2*logn>; + newNode->block = newBlock; + setRoot(newNode); + + } + + BVNode* x = getRoot(); + + #ifndef NDEBUG + if (i > x->subTreePositions+1) + { + printNode(x); + cerr << "error: insertBit: position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + if (i < 1) + { + cerr << "error: insertBit: position " << i <<" is not valid.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + #endif + + ulong lsP=0; + + + while (true) + { + // update subTree-counters + (x->subTreePositions)++; + if (bit) (x->subTreeRank)++; + + lsP = x->getLeft()->subTreePositions; + + if (lsP >= i) + {// cout << "A" << endl; + x=x->getLeft(); + } + else if (lsP+x->myPositions >= i-1){ //-1 to append to last position in current node + i-=lsP; + break; + } + else{ + i-=(lsP+x->myPositions); + x=x->getRight(); + } + } + + // now put the bit into the block x: + bitset<2*logn> mask; + + if (x->myPositions > 0) + { + if (i != 1) + { + mask.set(); + mask>>=2*logn - i +1; // make 1's at all positions < i + mask &= *(x->block); // store bits in front of new bit + (*(x->block)>>=i-1)<<=i; // shift bits by one + (*x->block)|=mask; // restore bits in front of new bit + } + else + (*x->block)<<=1; + } + + (*x->block)[i-1]=bit; // set new bit + + // update local pointers + (x->myPositions)++; //update p (leaf) + if (bit) (x->myRank)++; //update r (leaf) + + #ifndef NDEBUG + if ((int)x->myPositions > 2*logn) + { + cerr << "error: positions in block already too many.\n"; //shouldn't happen + exit(EXIT_FAILURE); + } + #endif + + // split and rotate, if necessary + if ((int)x->myPositions == 2*logn) + { + //cout << "split !-------------" << endl; + + // new node: + BVNode *newNode; + newNode = new BVNode(getNil()); + + //find place for new node: + BVNode *y;// some parent of new node + if (x->getRight() != getNil()) { + y = (BVNode*) this->treeMinimum(x->getRight()); + y->setLeft(newNode); + } else { + y=x; + y->setRight(newNode); + } + newNode->setParent(y); + + //new block: + bitset<2*logn> *newBlock; + newBlock=new bitset<2*logn>; + *newBlock=*x->block>>logn; //copy of bits into the new block + + mask.set(); + mask>>=logn; + *x->block &= mask; //delete bits that already have been copied to the new block + + newNode->block=newBlock; + newNode->myRank=(*newNode->block).count(); + newNode->myPositions=logn; + newNode->color=RED; + + //update old node x: + x->myRank-=newNode->myRank; //=(*x->block).count(); + x->myPositions=logn; + + updateCountersOnPathToRoot(newNode); // meets x + rbInsertFixup(newNode, callUpdateCounters); + + } // end if + + +} + +void BVTree::checkSubTree(BVNode *n){ + ulong lP = 0; + ulong lR = 0; + ulong rP = 0; + ulong rR = 0; + + if (n->getLeft()!=getNil()) { + lP = n->getLeft()->subTreePositions; + lR = n->getLeft()->subTreeRank; + if (n->getLeft()->getParent() != n) { + cout << "au"<< endl; + exit(1); + } + + } + + if (n->getRight()!=getNil()) { + rP = n->getRight()->subTreePositions; + rR = n->getRight()->subTreeRank; + if (n->getRight()->getParent() != n) { + cout << "au"<< endl; + exit(1); + } + } + + if ( (n->subTreePositions != (n->myPositions + lP + rP)) || + (n->subTreeRank != (n->myRank + lR + rR)) ){ + cout << "checkSubTree: error" <myPositions + lP + rP: " << n->myPositions + lP + rP << endl; + cout << "n->myRank + lR + rR: " << n->myRank + lR + rR << endl; + printNode(n); + printNode(n->getLeft()); + printNode(n->getRight()); + exit(1); + } + + if (n->getLeft()!=getNil()) checkSubTree(n->getLeft()); + if (n->getRight()!=getNil()) checkSubTree(n->getRight()); +} + +void callUpdateCounters(RBNode *n, RBTree *T){ + ((BVTree*)T)->updateCounters((BVNode*) n); +} + +void callUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){ + ((BVTree*)T)->updateCountersOnPathToRoot((BVNode*) n); +} + diff --git a/bittree.h b/bittree.h new file mode 100644 index 0000000..5344f74 --- /dev/null +++ b/bittree.h @@ -0,0 +1,227 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +// Implementation of the Dynamic Bit Vector with Indels problem +// space: O(n) time: O(log(n)) +// papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text +// Indexes. CPM 2006, Chapter 3.4 Dynamic Structures for Bit Vectors +// also: W.-L. Chan, W.-K. Hon, and T.-W. Lam. Compressed index for a dynamic collection +// of texts. In Proc. CPM04, LNCS 3109, pages 445-456, 2004 + +#ifndef BVTree +#define BVTree BVTree + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "rbtree.h" + + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + + +#ifndef LOGN +#define LOGN 64 +#endif + +const int logn = LOGN; + +//upperBound = 2 * logn; +//lowerBound = logn / 2; + + +class BVNode; +class BVTree; + +void callUpdateCounters(RBNode *n, RBTree *T); +void callUpdateCountersOnPathToRoot(RBNode *n, RBTree *T); + +class BVNode : public RBNode +{ + public: + ulong myPositions; // 4*4 bytes = 16 bytes + ulong myRank; + ulong subTreePositions; //number of positions stored in the subtree rooted at this node + ulong subTreeRank; //number of bits set in the subtree rooted at this node + + std::bitset<2*logn> *block; // 4 bytes + + + BVNode() + : RBNode(this), myPositions(0), myRank(0), subTreePositions(0), subTreeRank(0), block(0) { + } + + + BVNode(BVNode* n) + : RBNode(n), myPositions(0), myRank(0), subTreePositions(0), subTreeRank(0), block(0) { + } + + ~BVNode(){ + delete block; + block = 0; + } + + + BVNode* getParent(){ + return ((BVNode*) ((RBNode*) this)->parent); + } + + BVNode* getLeft(){ + return ((BVNode*) ((RBNode*) this)->left); + } + + BVNode* getRight(){ + return ((BVNode*) ((RBNode*) this)->right); + } + + void setParent(BVNode* n){ + ((RBNode*) this)->parent=(RBNode*)n; + } + + void setLeft(BVNode* n){ + ((RBNode*) this)->left=(RBNode*)n; + } + + void setRight(BVNode* n){ + ((RBNode*) this)->right=(RBNode*)n; + } + +}; + + +class BVTree : public RBTree{ +public: + + //Constructors + BVTree(){ + + setNil(new BVNode()); + setRoot(getNil()); + + tempnil = getNil(); + + tempbit = new std::bitset<2*logn>; + } + + //Destructor: + ~BVTree(); + + bool operator[](ulong); + + + // inserts bit at position i, countings begins with 1: + void appendBit(bool bit); + void insertBit(bool bit, ulong i); + void deleteBit(ulong i); + + ulong rank0(ulong i); + ulong rank1(ulong i); + ulong rank(bool b, ulong i){return b?rank1(i):rank0(i);} + + ulong select0(ulong i); + ulong select1(ulong i); + ulong select(bool b, ulong i){return b?select1(i):select0(i);} + + void setRoot(BVNode* n){ + ((RBTree*) this)->root=(RBNode*)n; + } + + BVNode* getRoot(){ + return ((BVNode*) ((RBTree*) this)->root); + } + + void setNil(BVNode* n){ + tempnil = n; + ((RBTree*) this)->nil=(RBNode*)n; + } + + BVNode* getNil(){ + return tempnil; + + } + + // write bits back into a stream: + ulong* getBits(); + void writeTree(char *writefile); + void writeTree(std::ostream& stream); //e.g. stream = cout + + int getTreeMaxDepth(); + int getTreeMinDepth(); + ulong getPositions(); + ulong getRank(); + + void iterateReset(); + bool iterateGetBit(); + bool iterateNext(); + ulong iterateGetRank(bool bit); + + bool getLastBitDeleted(){return lastBitDeleted;} + ulong getLastRank(){return lastRank;} + + void checkSubTree(BVNode *n); + + void updateCounters(BVNode *n); + void updateCountersOnPathToRoot(BVNode *walk); + + //debug: + void printNode(ulong i); + +protected: + + ulong iterate; + ulong iterateLocal; + ulong iterateRank; + BVNode *iterateNode; + + BVNode *tempnil; + + bool lastBitDeleted; + ulong lastRank; + + + std::bitset<2*logn> *tempbit; + + // content of BVNode, for debugging: + void printNode(BVNode *n); + + // other operations: + ulong getLocalRank(BVNode* n, ulong position); + ulong getLocalSelect0(BVNode* n, ulong query); + ulong getLocalSelect1(BVNode* n, ulong query); + + void deleteNode(BVNode *n); + void deleteLeaf(BVNode *leaf); +}; + + +#endif + diff --git a/dependencies.mk b/dependencies.mk new file mode 100644 index 0000000..4e40bc8 --- /dev/null +++ b/dependencies.mk @@ -0,0 +1,12 @@ +BitRank.o: BitRank.cpp BitRank.h Tools.h +CSA.o: CSA.cpp CSA.h BitRank.h Tools.h dynFMI.h bittree.h rbtree.h \ + handle.h pos.h +TextCollection.o: TextCollection.cpp TextCollection.h Tools.h +Tools.o: Tools.cpp Tools.h +bittree.o: bittree.cpp bittree.h rbtree.h +dynFMI.o: dynFMI.cpp dynFMI.h bittree.h rbtree.h handle.h pos.h +handle.o: handle.cpp handle.h rbtree.h pos.h +pos.o: pos.cpp pos.h rbtree.h handle.h +rbtree.o: rbtree.cpp rbtree.h +testTextCollection.o: testTextCollection.cpp CSA.h BitRank.h Tools.h \ + dynFMI.h bittree.h rbtree.h handle.h pos.h TextCollection.h diff --git a/dynFMI.cpp b/dynFMI.cpp new file mode 100644 index 0000000..dd4e989 --- /dev/null +++ b/dynFMI.cpp @@ -0,0 +1,807 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + + +#include +#include +#include +#include +#include +#include +#include + +#include "dynFMI.h" + +using std::cerr; +using std::cout; +using std::endl; +using std::ios; +using std::priority_queue; +using std::vector; + + + +// -------- DynFMI -------- + +DynFMI::~DynFMI(){ + deleteDynFMINodes(root); + root = 0; + delete[] leaves; //free(leaves) // ???; +#if SAMPLE!=0 + delete SampledSAPositionsIndicator; +#endif + delete handle; + delete pos; +#if SAMPLE!=0 + delete sampledSATree; +#endif +} + +void DynFMI::deleteDynFMINodes(WaveletNode *n){ + if (n->right) deleteDynFMINodes(n->right); + if (n->left) deleteDynFMINodes(n->left); + delete n; +} + +DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){ + initEmptyDynFMI(text, x, n); + if (del) delete [] text; +} + +DynFMI::DynFMI(char *readfile){ + + std::ifstream file(readfile); + if (!file) + { + cerr << "error reading file: " << readfile << endl; + exit(EXIT_FAILURE); + } + + file.seekg(0, ios::end); + ulong n = file.tellg(); + file.seekg(0, ios::beg); + + uchar *text = new uchar[n]; + + char c; + ulong i=0; + + + while (file.get(c)) + { + text[i]=c; + i++; + } + file.close(); + + initEmptyDynFMI(text, 1, n); + + + delete text; + +} + +void DynFMI::iterateReset(){ + iterate = 1; + recursiveIterateResetWaveletNode(root); +} + +void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){ + w->bittree->iterateReset(); + + if (w->left) recursiveIterateResetWaveletNode(w->left); + if (w->right) recursiveIterateResetWaveletNode(w->right); +} + + +bool DynFMI::iterateNext(){ + iterate++; + return !(iterate > getSize()); +} + +uchar DynFMI::iterateGetSymbol(){ + + ulong i = iterate; + WaveletNode *walk = root; + bool bit; + + while (true) { + + bit = walk->bittree->iterateGetBit(); // TODO improve + i=walk->bittree->iterateGetRank(bit); + + walk->bittree->iterateNext(); + + + if (bit) { //bit = 1 + if (walk->right == 0) return walk->c1; + walk=walk->right; + } else { // bit = 0 + if (walk->left == 0) return walk->c0; + walk=walk->left; + } + + + } // end of while + +} + + +uchar* DynFMI::getBWT(){ + ulong n = root->bittree->getPositions(); + n++; + uchar *text = new uchar[n]; + + bool data=true; + // old slow version: + //for (ulong i=1; i <= root->bittree->getPositions(); i++) + // text[i-1]=(*this)[i]; + + ulong i = 0; + + iterateReset(); + + while (data) { + text[i] = iterateGetSymbol(); + data = iterateNext(); + i++; + } + text[n-1]=0; + + return text; +} + +std::ostream& DynFMI::getBWTStream(std::ostream& stream){ + ulong n = root->bittree->getPositions(); + n++; + //uchar *text = new uchar[n]; + + bool data=true; + // old slow version: + //for (ulong i=1; i <= root->bittree->getPositions(); i++) + // text[i-1]=(*this)[i]; + + ulong i = 0; + + iterateReset(); + + while (data) { + //text[i] = iterateGetSymbol(); + stream << iterateGetSymbol(); + data = iterateNext(); + i++; + } + //text[n-1]=0; + + return stream; +} + +//void DynFMI::printDynFMIContent(ostream& stream){ +// uchar c; +// for (ulong i=1; i<=getSize(); i++) +// { +// c =(*this)[i]; +// if (c==0) c= '#'; +// stream << c; +// } + + +void DynFMI::deleteLeaves(WaveletNode *node){ + bool leaf = true; + + if (node->left) { + // internal node + leaf = false; + deleteLeaves(node->left); + + } + if (node->right){ + leaf = false; + deleteLeaves(node->right); + } + + if (leaf) { + // is a leaf, delete it! + if (node->parent) { + if (node==node->parent->left) node->parent->left=0; + else node->parent->right=0; + } + delete node; + } +} + +void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){ + #ifndef NDEBUG + if (node == node->left) { + cout << "makeCodes: autsch" << endl; + exit(0); + } + #endif + + if (node->left) { + makeCodes(code | (0 << bits), bits+1, node->left); + makeCodes(code | (1 << bits), bits+1, node->right); + } else { + codes[node->c0] = code; + codelengths[node->c0] = bits+1; + } +} + +void DynFMI::appendBVTrees(WaveletNode *node){ + node->bittree = new BVTree(); + + if (node->left) appendBVTrees(node->left); + if (node->right) appendBVTrees(node->right); +} + +void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){ + // pointers to the leaves for select + leaves = (WaveletNode**) new WaveletNode*[256]; + for(int j=0; j<256; j++) leaves[j]=0; + + + ulong i; + for(i=0; i < n; i++) { + if (text[i]==0) { + cerr << "Error: unsigned char 0 is reserved." << endl; + exit(EXIT_FAILURE); + } + + if (leaves[text[i]]==0) { + leaves[text[i]] = new WaveletNode(text[i]); + } + leaves[text[i]]->weight++; + } + + // separation symbol: + leaves[0] = new WaveletNode((uchar)0); + leaves[0]->weight=x; + + // Veli's approach: + priority_queue< WaveletNode*, vector, std::greater > q; + + + for(int j=255; j>=0; j--){ + if (leaves[j]!=0) { + q.push( (leaves[j]) ); + } + codes[j] = 0; + codelengths[j] = 0; + } + + // creates huffman shape: + while (q.size() > 1) { + + WaveletNode *left = q.top(); + q.pop(); + + WaveletNode *right = q.top(); + q.pop(); + + q.push( new WaveletNode(left, right) ); + } + + + root = q.top(); + q.pop(); + + + makeCodes(0,0, root); // writes codes and codelengths + + + // merge leaves (one leaf represent two characters!) + for(int j=0; j<256; j++){ + + if (leaves[j]) { + + if (leaves[j]->parent->left==leaves[j]) { + leaves[j]->parent->c0=j; + } else { + leaves[j]->parent->c1=j; + } + leaves[j]=leaves[j]->parent; // merge + } + } + + deleteLeaves(root); + appendBVTrees(root); + + // array C needed for backwards search + for(int j=0; j<256+256; j++) C[j] = 0; + +#if SAMPLE!=0 + this->SampledSAPositionsIndicator = new BVTree(); + this->sampledSATree = new SampledSATree(); +#endif + + this->pos = new Pos(sampleInterval); + this->handle = new Handle(); + + pos->handle=this->handle; + handle->pos=this->pos; + +} + +void DynFMI::append(uchar *text, ulong length){ + ulong i; + ulong start = root->bittree->getPositions(); + for (i = start; ibittree->getPositions()+1); +} + +void DynFMI::insert(uchar c, ulong i){ + #ifndef NDEBUG + if (codelengths[c]==0) { + cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;; + exit(EXIT_FAILURE); + } + #endif + + ulong level = 0; + ulong code = codes[c]; + + bool bit; + WaveletNode *walk = root; + + while (walk) { + + bit = ((code & (1u << level)) != 0); + + walk->bittree->insertBit(bit,i); // TODO improve + i=walk->bittree->rank(bit, i); + + if (bit) { //bit = 1 + walk=walk->right; + } else { // bit = 0 + walk=walk->left; + } + + level++; + } // end of while + + int j = 256+c; + while(j>1) { + C[j]++; + j=binaryTree_parent(j); + } + C[j]++; + +} + +void DynFMI::deleteSymbol(ulong i){ + WaveletNode *walk = root; + bool bit; + uchar c; + while (true) { + + // original slow version: + //bit = (*walk->bittree)[i]; + //old_i = i; + //i=walk->bittree->rank(bit, i); + //walk->bittree->deleteBit(old_i); + + + walk->bittree->deleteBit(i); + bit=walk->bittree->getLastBitDeleted(); + i=walk->bittree->getLastRank(); + + if (bit) { //bit = 1 + if (walk->right == 0) { + c = walk->c1; + break; + } + walk=walk->right; + } else { // bit = 0 + if (walk->left == 0) { + c = walk->c0; + break; + } + walk=walk->left; + } + + + } // end of while + + int j = 256+c; + while(j>1) { + C[j]--; + j=binaryTree_parent(j); + } + C[j]--; + + +} + + +uchar DynFMI::operator[](ulong i){ + WaveletNode *walk = root; + bool bit; + + while (true) { + + bit = (*walk->bittree)[i]; //TODO improve by reducing + i=walk->bittree->rank(bit, i); + + if (bit) { //bit = 1 + if (walk->right == 0) return walk->c1; + walk=walk->right; + } else { // bit = 0 + if (walk->left == 0) return walk->c0; + walk=walk->left; + } + + + } // end of while + cout << endl; +} + +ulong DynFMI::rank(uchar c, ulong i){ + if (i==0) return 0; + + ulong level = 0; + ulong code = codes[c]; + + + bool bit; + WaveletNode *walk = root; + + while (true) { + + bit = ((code & (1u << level)) != 0); + + i=walk->bittree->rank(bit, i); + if (bit) { //bit = 1 + if (walk->right == 0) return i; + walk=walk->right; + } else { // bit = 0 + if (walk->left == 0) return i; + walk=walk->left; + } + + level++; + } // end of while + + cerr << "error: DynFMI::rank: left while loop" << endl; + exit(EXIT_FAILURE); + return 0; //never +} + +ulong DynFMI::select(uchar c, ulong i){ + + WaveletNode *walk = leaves[c]; + + bool bit = (walk->c1==c); + + while (walk->parent) { + i=walk->bittree->select(bit, i); + + bit = (walk == walk->parent->right); + walk=walk->parent; + } // end of while + + i=walk->bittree->select(bit, i); + + return i; +} + + +ulong DynFMI::count(uchar *pattern){ + + ulong i = strlen((char*)pattern); + ulong sp = 1; + + ulong ep = getSize(); + uchar s; + while ((sp <= ep) && (i >=1)) { + s=pattern[i-1]; + sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1; + ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep); + + i--; + } + return ep-sp +1; +} + +#if SAMPLE!=0 +ulong* DynFMI::locate(uchar *pattern){ + + ulong i = strlen((char*)pattern); + ulong sp = 1; + + ulong ep = getSize(); + uchar s; + while ((sp <= ep) && (i >=1)) { + s=pattern[i-1]; + sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1; + ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep); + + i--; + } + + ulong numberOfMatches = ep-sp +1; + ulong *matches = new ulong[2*numberOfMatches + 1]; + matches[0] = numberOfMatches; + + ulong diff; + ulong k = 1; + cout << "----------------------" << endl; + for (ulong j=sp; j <= ep; j++) { + + i=j; + diff = 0; + while (!(*SampledSAPositionsIndicator)[i]) { + diff++; + s=(*this)[i]; + i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i); + + } + + + sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i)); + + //cout << "found: (" << sampledSATree->handle << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl; + matches[k++] = sampledSATree->handle; + matches[k++] = sampledSATree->sampledSAValue + diff; + } + + return matches; +} +#endif + +// size must include endmarker! +ulong DynFMI::addText(uchar const * str, ulong n){ + ulong i; +#if SAMPLE!=0 + bool sample = false; +#endif + i=pos->getSize()+1; + + ulong key = pos->appendText(n); + + // Adding an empty text (end-marker) + if (n == 1) + { + insert(str[0], i); + //if (del) delete [] str; + return key; + } + + insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1 +#if SAMPLE!=0 + sample = (n-1 % sampleInterval == 0); + SampledSAPositionsIndicator->insertBit(sample, i); + if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + for (ulong t=n-2; t > 0; t--) { + i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i); + insert(str[t-1],i); +#if SAMPLE!=0 + sample = ((t) % sampleInterval == 0); + SampledSAPositionsIndicator->insertBit(sample, i); + + if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + } + + + i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i); + insert(str[n-1],i); +#if SAMPLE!=0 + SampledSAPositionsIndicator->insertBit(true, i); + sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i)); +#endif +// if (del) delete [] str; + + return key; +} + + +ulong DynFMI::addTextFromFile(char* filename, ulong n){ + ulong i; +#if SAMPLE!=0 + bool sample = false; +#endif + i=pos->getSize()+1; + + + std::ifstream str(filename, ios::binary); + if (!str) + { + cerr << "error reading file: " << filename << endl; + exit(EXIT_FAILURE); + } + + char c; + //char oldc; + + + ulong buf_len; //1048576; //10MB + char * buffer; + ulong buf_pos; + + if (BUFFER==0) buf_len = n; + else buf_len = BUFFER; + + buffer = new char[buf_len]; + buf_pos = (n-2) / buf_len; + str.seekg(buf_pos*buf_len); + str.read(buffer, buf_len); + str.clear(std::ios_base::goodbit); + + + ulong key = pos->appendText(n); + + //str.seekg(n-2); + + //c=str.get(); + c=buffer[(n-2)%buf_len]; + insert(c,i); // insert second last character, corresponds to suffix of length 1 +#if SAMPLE!=0 + sample = (n-1 % sampleInterval == 0); + SampledSAPositionsIndicator->insertBit(sample, i); + if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + if ((n-2)%buf_len == 0) { + c=buffer[0]; + buf_pos--; + str.seekg(buf_pos*buf_len); + str.read(buffer, buf_len); + } + for (ulong t=n-2; t > 0; t--) { + + i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i); + + c=buffer[(t-1)%buf_len]; + insert(c,i); + + //check buffer + if (((t-1)%buf_len == 0) && ((t-1)!=0)) { +#ifndef NDEBUG + if (buf_pos == 0) { + cerr << "buf_pos too small" << endl; + exit(0); + } +#endif + buf_pos--; + str.seekg(buf_pos*buf_len); + str.read(buffer, buf_len); + } + +#if SAMPLE!=0 + sample = ((t) % sampleInterval == 0); + SampledSAPositionsIndicator->insertBit(sample, i); + + if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + } + + i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); + insert(c,i); + +#if SAMPLE!=0 + SampledSAPositionsIndicator->insertBit(true, i); + sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i)); +#endif + + str.close(); + + return key; +} + + +uchar* DynFMI::retrieveText(ulong key){ + cout << "key: " << key << endl; + uchar *text=0; + // TODO access two times, too much + ulong i=handle->getPos(key); //=bwtEnd; + if (i == 0 ) { + return text; + } + + ulong n=pos->getTextSize(i); + + text = new uchar[n]; // last byte 0 for cout + + uchar c; + +//TODO better + for (ulong t=n-2; t < n; t--) { + c = (*this)[i]; + text[t]=(c==0?'#':c); + + i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); + } + + #ifndef NDEBUG + for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-'; + #endif + + text[n-1] = 0; + return text; +} + + +bool DynFMI::deleteText(ulong key){ + // TODO access two times, can do better + ulong i=handle->getPos(key); //=bwtEnd; + if (i == 0) return false; + + ulong n=pos->getTextSize(i); + ulong *text = new ulong[n]; + uchar c; + + for (ulong t=n-1; t deleteSymbol(text[i]); +#if SAMPLE!=0 + SampledSAPositionsIndicator->deleteBit(text[i]); +//TODO samples ? +#endif + } + delete[] text; + handle->deleteKey(key); + + + return true; +} + +ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){ + int j = 256+c; + ulong r=0; + while(j>1) { + if (binaryTree_isRightChild(j)) + r += C[binaryTree_left(binaryTree_parent(j))]; + + j=binaryTree_parent(j); + } + return r; +} + + + + +void DynFMI::printDynFMIContent(std::ostream& stream){ + uchar c; + for (ulong i=1; i<=getSize(); i++) + { + c =(*this)[i]; + if (c==0) c= '#'; + stream << c; + } +} + + + + diff --git a/dynFMI.h b/dynFMI.h new file mode 100644 index 0000000..41ef992 --- /dev/null +++ b/dynFMI.h @@ -0,0 +1,233 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +// ------ Dynamic Sequence with Indels ----- +// uses huffman shaped wavelet tree +// space: O(nH(0)) operation time: O(log n log \sigma) +// papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text +// Indexes. CPM 2006, Chapter 3.6 + + +#include +#include +#include +#include +#include + +#ifndef SAMPLE +//#define SAMPLE 1024 +#define SAMPLE 0 // No sampling. +#endif + +#ifndef BUFFER +#define BUFFER 1048576 +#endif + + +#include "bittree.h" +#include "handle.h" +#include "pos.h" + +#if SAMPLE!=0 +#include "ssatree.h" +#endif + + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + + +const ulong sampleInterval = SAMPLE; + +class WaveletNode{ + public: + + WaveletNode *left; + WaveletNode *right; + WaveletNode *parent; + ulong weight; // used only while construction + uchar c0; // used also while construction + uchar c1; + + BVTree *bittree; + + WaveletNode(uchar c) + : left(0), right(0), parent(0), weight(0), c0(c), bittree(0){} + + WaveletNode(WaveletNode *left, WaveletNode *right) + : left(left), right(right), parent(0), bittree(0) { + weight = left->weight + right->weight; + left->parent = this; + right->parent = this; + } + + ~WaveletNode(){ + delete bittree; + bittree = 0; + } + + bool operator>(const WaveletNode &a) const { + return (weight > a.weight); + } + + +}; + +namespace std +{ + +template<> struct greater + { + bool operator()(WaveletNode const* p1, WaveletNode const* p2) + { + if(!p1) + return false; + if(!p2) + return true; + return *p1 > *p2; + } + }; +} + + +class DynFMI{ + public: + + // read char distribution from string + // x=expected number of texts, n=length of text, del=delete text afterwards + DynFMI(uchar *text, ulong x, ulong n, bool del); + // read char distribution from file + DynFMI(char *readfile); + + ~DynFMI(); + + + + //n=length of str, del=delete text afterwards + ulong addText(uchar const * str, ulong n); + ulong addTextFromFile(char* filename, ulong n); + uchar* retrieveText(ulong text); + bool deleteText(ulong key); + + ulong count(uchar *pattern); +#if SAMPLE!=0 + ulong* locate(uchar *pattern); +#endif + + //LF(i)-mapping: C[L[i]]+rank_L[i](L,i) + ulong LFmapping(ulong i) { + uchar s=(*this)[i]; + return (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,i); + } + + + ulong getSize() {return root->bittree->getPositions();} + ulong getCollectionSize(){ return pos->getSize();} + uchar* getBWT(); + std::ostream& getBWTStream(std::ostream& stream); + + uchar operator[](ulong i); + void printDynFMIContent(std::ostream& stream); + ulong* getKeys() {return handle->getKeys(); } + ulong getPos(ulong i) + { return handle->getPos(i); } + + private: + WaveletNode *root; + WaveletNode **leaves; // needed for construction and select + + ulong codes[256]; + int codelengths[256]; + ulong C[256+256]; + +#if SAMPLE!=0 + BVTree *SampledSAPositionsIndicator; +#endif + ulong iterate; + + Handle *handle; + Pos *pos; +#if SAMPLE!=0 + SampledSATree *sampledSATree; +#endif + + + ulong rank(uchar c, ulong i); + ulong select(uchar c, ulong i); + + void insert(uchar c, ulong i); + void append(uchar c); + void append(uchar *text, ulong length); + void deleteSymbol(ulong i); + + + + // functions + ulong getNumberOfSymbolsSmallerThan(uchar c); + + void initEmptyDynFMI(uchar *text, ulong x, ulong n); + + void makeCodes(ulong code, int bits, WaveletNode *node); + void deleteLeaves(WaveletNode *node); + void appendBVTrees(WaveletNode *node); + + void deleteDynFMINodes(WaveletNode *n); + + //Iterator (public??) + void iterateReset(); + bool iterateNext(); + uchar iterateGetSymbol(); + void recursiveIterateResetWaveletNode(WaveletNode *w); + + + // small help functions + double log2(double x){ + return (log10(x) / log10((double)2)); + } + + int binaryTree_parent(int i){ + return (int)floor((double)i/2); + } + + int binaryTree_left(int i){ + return 2*i; + } + + int binaryTree_right(int i){ + return 2*i + 1; + } + + bool binaryTree_isLeftChild(int i){ + return (i%2==(int)0); + } + + bool binaryTree_isRightChild(int i){ + return (i%2==(int)1); + } + +}; + + + + diff --git a/handle.cpp b/handle.cpp new file mode 100644 index 0000000..9568dd4 --- /dev/null +++ b/handle.cpp @@ -0,0 +1,288 @@ + +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +#include +#include + +#include "handle.h" + +using std::cerr; +using std::cout; +using std::endl; + +ulong Handle::getPos(ulong key){ + HandleNode *n = getKey(key); + if (n==0) return 0; + + PosNode *p = n->posNode; + + #ifndef NDEBUG + if (p==pos->getNil()) { + cerr << "error: Handle::getPos: no PosNode found." << endl; + exit(EXIT_FAILURE); + } + #endif + return (this->pos->getPos(p)); +} + +ulong* Handle::getEndPositions(){ + ulong num = getSize(); + ulong i = 0; + + if (this->root == getNil()) return 0; + + ulong *positions= new ulong[num]; + + + HandleNode *n = (HandleNode*) treeMinimum(root); + while (n != getNil()) { + + //printf("node: %d", n); + positions[i] = this->pos->getPos(n->posNode); + i++; + n = (HandleNode*) treeSuccessor((RBNode*) n); + } + return positions; +} + + +ulong* Handle::getKeys(){ + ulong num = getSize(); + ulong i = 0; + + if (this->root == getNil()) return 0; + + ulong *keys= new ulong[num]; + + + HandleNode *n = (HandleNode*) treeMinimum(root); + #ifndef NDEBUG + HandleNode *oldn; + #endif + while (n != getNil()) { + + //printf("node: %d", n); + keys[i] = n->key; + i++; + #ifndef NDEBUG + oldn = n; + #endif + n = (HandleNode*) treeSuccessor((RBNode*) n); + #ifndef NDEBUG + if (n == oldn) { + cerr << "old!" << endl; + exit(EXIT_FAILURE); + } + #endif + } + #ifndef NDEBUG + if (num != i) { + cerr << "num: " << num << endl; + cerr << "i: " << i << endl; + cerr << "error: Handle::getKeys: key number was not correct." << endl; + exit(EXIT_FAILURE); + } + #endif + return keys; +} + +void Handle::deleteKey(ulong key){ + + HandleNode *n = getKey(key); + this->pos->deletePosNode(n->posNode); + + rbDelete( n, callHandleUpdateCountersOnPathToRoot); + delete n; +} + + + + +HandleNode* Handle::getKey(ulong key){ + HandleNode *n= getRoot(); + while (n != getNil()) { + if (n->key == key) return n; + if (n->getLeft() != getNil()) { + if (key <= n->getLeft()->maxkey) n=n->getLeft(); + else n=n->getRight(); + } + else n=n->getRight(); + } + + + return 0; +} + +void Handle::updateCountersOnPathToRoot(HandleNode *n) { + while (n != getNil()) { + updateCounters(n); + + n = n->getParent(); + } +} + + +void Handle::updateCounters(HandleNode *n) { + #ifndef NDEBUG + if (n == getNil()) { + cerr << "error: Handle::updateCounters" << endl; + exit(EXIT_FAILURE); + } + #endif + n->subtreeSize=1; + + + n->subtreeSize += n->getLeft()->subtreeSize; + + + if ( n->getRight() != getNil()) { + n->subtreeSize += n->getRight()->subtreeSize; + n->maxkey=n->getRight()->maxkey; + } else n->maxkey=n->key; + +} + + +HandleNode* Handle::getNewKey(){ + HandleNode *n= getRoot(); + + if (n == getNil()) { + //tree is empty + HandleNode *newLeaf= new HandleNode(getNil(),1); // 1=smallest key, 0 = error + setRoot(newLeaf); + ((RBNode*)newLeaf)->color=BLACK; + + return newLeaf; + } + + if (n->maxkey == n->subtreeSize) { + // tree is full + HandleNode *last = (HandleNode*) treeMaximum(n); + + + HandleNode *newLeaf= new HandleNode(getNil(), n->maxkey+1); + + last->setRight(newLeaf); + newLeaf->setParent(last); + + if (newLeaf->getParent() != getNil()) updateCountersOnPathToRoot(newLeaf->getParent()); + + rbInsertFixup(newLeaf, callHandleUpdateCounters); + + return newLeaf; + } + + HandleNode *newNode; + ulong smallerkeys = 0; + ulong lmax; //getLeft()->maxkey + ulong lsub; //n->getLeft()->subtreeSize + while (true) { + cout << "search first free key" << endl; + // search first free key + + lmax = n->getLeft()->maxkey; + lsub = n->getLeft()->subtreeSize; + + + if (lmax == 0) { // no left child + if (smallerkeys+1 < n->key) { // check if it is free + newNode= new HandleNode(getNil(), smallerkeys+1); + newNode->setParent(n); + n->setLeft(newNode); + updateCountersOnPathToRoot(n); + rbInsertFixup(newNode, callHandleUpdateCounters); + return newNode; + } + } else { //left child exists + if ( lmax > (lsub + smallerkeys) ) { + // free key at left subtree + n=n->getLeft(); + continue; + } else if (lmax + 1 < n->key) { // full left subtree, check if it is free inbetween + // insert to predecessor + n=(HandleNode*)treePredecessor(n); + //found place -> :predeccessor new key = lmax + 1 + newNode= new HandleNode(getNil(), lmax+1); + newNode->setParent(n); + n->setRight(newNode); + updateCountersOnPathToRoot(n); + rbInsertFixup(newNode, callHandleUpdateCounters); + return newNode; + + } + } + + smallerkeys += 1+lsub; + + if (n->getRight() == getNil()) { // no right child, must be free + newNode= new HandleNode(getNil(), smallerkeys+1); + newNode->setParent(n); + n->setRight(newNode); + updateCountersOnPathToRoot(n); + rbInsertFixup(newNode, callHandleUpdateCounters); + return newNode; + } else { //right child exists + + //n=n->getRight(); + if (n->getRight()->maxkey-smallerkeys == n->getRight()->subtreeSize) { // right subTree is full, insert in front + ulong leftMinKey = n->getRight()->maxkey - n->getRight()->subtreeSize; + if (leftMinKey -1 > n->key) { //in front there is space + //insert new n-key + 1 + newNode= new HandleNode(getNil(), n->key+1); + n=(HandleNode*)treeSuccessor(n); + newNode->setParent(n); + n->setLeft(newNode); + updateCountersOnPathToRoot(n); + rbInsertFixup(newNode, callHandleUpdateCounters); + return newNode; + } else { + cerr << "error: Handle::getNewKey: no space ?!? " << endl; + exit(EXIT_FAILURE); + } + } else { + n=n->getRight(); + } + + + } + + #ifndef NDEBUG + if (n==getNil()) { + cerr << "error: Handle::getNewKey: (A) something wrong ! " << endl; + exit(EXIT_FAILURE); + } + #endif + } + #ifndef NDEBUG + cerr << "error: Handle::getNewKey: (B) something wrong ! " << endl; + exit(EXIT_FAILURE); + #endif + return 0; // error +} + +void callHandleUpdateCounters(RBNode *n, RBTree *T){ + ((Handle *)T)->updateCounters((HandleNode*) n); +} + +void callHandleUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){ + ((Handle *)T)->updateCountersOnPathToRoot((HandleNode*) n); +} + diff --git a/handle.h b/handle.h new file mode 100644 index 0000000..dbe2fc4 --- /dev/null +++ b/handle.h @@ -0,0 +1,134 @@ + +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + +#ifndef Handle +#define Handle Handle + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + +#include "rbtree.h" +#include "pos.h" + + +class Pos; +class PosNode; + +void callHandleUpdateCounters(RBNode *n, RBTree *T); +void callHandleUpdateCountersOnPathToRoot(RBNode *n, RBTree *T); + + +class HandleNode : public RBNode { + public: + ulong key; //maximum for internal nodes + ulong subtreeSize; + ulong maxkey; + PosNode *posNode; + + HandleNode() + : RBNode(this), key(0), subtreeSize(0), maxkey(0){ + } + + + HandleNode(HandleNode *n, ulong key) + : RBNode(n), key(key), subtreeSize(1), maxkey(key){ + } + + + HandleNode* getParent(){ + return ((HandleNode*) ((RBNode*) this)->parent); + } + + HandleNode* getLeft(){ + return ((HandleNode*) ((RBNode*) this)->left); + } + + HandleNode* getRight(){ + return ((HandleNode*) ((RBNode*) this)->right); + } + + void setParent(HandleNode* n){ + ((RBNode*) this)->parent=(RBNode*)n; + } + + void setLeft(HandleNode* n){ + ((RBNode*) this)->left=(RBNode*)n; + } + + void setRight(HandleNode* n){ + ((RBNode*) this)->right=(RBNode*)n; + } + +}; + +class Handle : public RBTree{ + public: + Pos *pos; + + Handle(){ + setNil(new HandleNode()); + setRoot(getNil()); + } + + + void setRoot(HandleNode* n){ + ((RBTree*) this)->root=(RBNode*)n; + } + + HandleNode* getRoot(){ + return ((HandleNode*) ((RBTree*) this)->root); + } + + void setNil(HandleNode* n){ + ((RBTree*) this)->nil=(RBNode*)n; + } + + HandleNode* getNil(){ + return ((HandleNode*) ((RBTree*) this)->nil); + } + + + ulong getSize(){ + return (getRoot()!=getNil())?getRoot()->subtreeSize:0; + } + + ulong* getKeys(); + + ulong getPos(ulong key); + ulong* getEndPositions(); + + HandleNode* getKey(ulong key); + void deleteKey(ulong key); + void updateCountersOnPathToRoot(HandleNode *n); + void updateCounters(HandleNode *n); + + HandleNode* getNewKey(); + + + +}; + +#endif diff --git a/makefile b/makefile new file mode 100644 index 0000000..0d90342 --- /dev/null +++ b/makefile @@ -0,0 +1,15 @@ +CC = g++ +CPPFLAGS = -Wall -ansi -pedantic -g + +testTextCollection_obs = testTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o handle.o pos.o rbtree.o dynFMI.o + +testTextCollection: $(testTextCollection_obs) + $(CC) -o testTextCollection $(testTextCollection_obs) + +clean: + rm -f core *.o *~ testTextCollection + +depend: + $(CC) -MM *.cpp *.c > dependencies.mk + +include dependencies.mk diff --git a/pos.cpp b/pos.cpp new file mode 100644 index 0000000..45f2b15 --- /dev/null +++ b/pos.cpp @@ -0,0 +1,170 @@ + +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modquity * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + +#include +#include + +#include "pos.h" + +using std::cerr; +using std::endl; + +ulong Pos::getPos(PosNode *n){ + ulong position=0; + + if (n->getLeft() != getNil()) position += n->getLeft()->subtreeSize; + + while (n->getParent() != getNil()) { + if (n == n->getParent()->getRight()) { + // rightchild + if (n->getParent()->getLeft() != getNil()) position += n->getParent()->getLeft()->subtreeSize; + position++; + } + + n=n->getParent(); + } + + + return position+1; +} + +ulong Pos::getTextSize(ulong pos){ + PosNode *n= getPosNode(pos); + + return n->textSize; +} + +void Pos::deleteText(ulong pos){ + PosNode *n= getPosNode(pos); + + // current node matches. + rbDelete(n, callPosUpdateCountersOnPathToRoot); + delete n; +} + +void Pos::deletePosNode(PosNode *n){ + rbDelete(n, callPosUpdateCountersOnPathToRoot); + delete n; +} + +PosNode* Pos::getPosNode(ulong pos){ + // smallest pos is 1 ! + pos--; + PosNode *n= getRoot(); + ulong leftTree=0; + ulong leftSubTreeSize; + while (true) { + leftSubTreeSize = n->getLeft()->subtreeSize; + + if (pos == leftTree + leftSubTreeSize ) { + // current node matches. + return n; + } else if (pos < leftTree + leftSubTreeSize){ + n=n->getLeft(); + } + else { + leftTree += leftSubTreeSize +1; + n=n->getRight(); + } + } + + + cerr << "error: Pos::getPosNode: text POS " << pos << " not found!" << endl; + exit(EXIT_FAILURE); + + return getNil(); +} + + +ulong Pos::appendText(ulong textSize){ + PosNode *n= getRoot(); + + if (n == getNil()) { + PosNode *newLeaf= new PosNode(getNil(), textSize); + + setRoot(newLeaf); + ((RBNode*)newLeaf)->color = BLACK; + + HandleNode* hn = handle->getNewKey(); + + // connect handleNode and posNode: + hn->posNode = newLeaf; + newLeaf->handleNode = hn; + + return hn->key; + } + + while (n->getRight() != getNil()) { + n=n->getRight(); + } + + // new right child ! + PosNode *newLeaf= new PosNode(getNil(), textSize); + + + n->setRight(newLeaf); + newLeaf->setParent(n); + + updateCountersOnPathToRoot(newLeaf); + + rbInsertFixup(newLeaf, callPosUpdateCounters); + + HandleNode* hn = handle->getNewKey(); + + // connect handleNode and posNode: + hn->posNode = newLeaf; + newLeaf->handleNode = hn; + + return hn->key; +} + + + + +void Pos::updateCountersOnPathToRoot(PosNode *n) { + while (n != getNil()) { + updateCounters(n); + + n = n->getParent(); + } +} + +void Pos::updateCounters(PosNode *n) { + n->subtreeSize=1; + //n->sumTextSize = n->textSize; + + n->subtreeSize += n->getLeft()->subtreeSize; + //n->sumTextSize += n->getLeft()->sumTextSize; + + n->subtreeSize += n->getRight()->subtreeSize; + //n->sumTextSize += n->getRight()->sumTextSize; +} + + + +void callPosUpdateCounters(RBNode *n, RBTree *T){ + ((Pos *)T)->updateCounters((PosNode*) n); +} + +void callPosUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){ + ((Pos *)T)->updateCountersOnPathToRoot((PosNode*) n); +} + diff --git a/pos.h b/pos.h new file mode 100644 index 0000000..4fcdbb7 --- /dev/null +++ b/pos.h @@ -0,0 +1,139 @@ + +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + + //TODO check star color of the nodes pos and handle! + +#ifndef Pos +#define Pos Pos + +#ifndef uchar +#define uchar unsigned char +#endif +#ifndef ulong +#define ulong unsigned long +#endif + +#include "rbtree.h" +#include "handle.h" + +class Handle; +class HandleNode; + +void callPosUpdateCounters(RBNode *n, RBTree *T); +void callPosUpdateCountersOnPathToRoot(RBNode *n, RBTree *T); + +class PosNode : public RBNode { + public: + + HandleNode *handleNode; + ulong subtreeSize; + ulong textSize; // size including endmarker! + //ulong sumTextSize; // sum of textlength of all texts located in this subtree; + + PosNode() // NIL + : RBNode(this),subtreeSize(0),textSize(0) { + } + + PosNode(PosNode *n, ulong textSize) + : RBNode(n),subtreeSize(1),textSize(textSize) { + } + + + PosNode* getParent(){ + return ((PosNode*) ((RBNode*) this)->parent); + } + + PosNode* getLeft(){ + return ((PosNode*) ((RBNode*) this)->left); + } + + PosNode* getRight(){ + return ((PosNode*) ((RBNode*) this)->right); + } + + void setParent(PosNode* n){ + ((RBNode*) this)->parent=(RBNode*)n; + } + + void setLeft(PosNode* n){ + ((RBNode*) this)->left=(RBNode*)n; + } + + void setRight(PosNode* n){ + ((RBNode*) this)->right=(RBNode*)n; + } +}; + +class Pos: public RBTree +{ + public: + + Handle* handle; + + ulong textNumber; + ulong matchPosition; + + ulong sampleInterval; + + Pos(ulong sampleInterval){ + setNil(new PosNode()); + setRoot(getNil()); + this->sampleInterval=sampleInterval; + } + + void setRoot(PosNode* n){ + ((RBTree*) this)->root=(RBNode*)n; + } + + PosNode* getRoot(){ + return ((PosNode*) ((RBTree*) this)->root); + } + + void setNil(PosNode* n){ + ((RBTree*) this)->nil=(RBNode*)n; + } + + PosNode* getNil(){ + return ((PosNode*) ((RBTree*) this)->nil); + } + + ulong getSize(){ + return (getRoot()!=getNil())?getRoot()->subtreeSize:0; + } + + + + ulong getPos(PosNode *n); + + + PosNode* getPosNode(ulong text); + ulong getTextSize(ulong pos); + void deleteText(ulong pos); + void deletePosNode(PosNode *n); + ulong appendText(ulong textSize); + + void updateCountersOnPathToRoot(PosNode *n); + void updateCounters(PosNode *n); + +}; + +#endif diff --git a/rbtree.cpp b/rbtree.cpp new file mode 100644 index 0000000..09326c1 --- /dev/null +++ b/rbtree.cpp @@ -0,0 +1,454 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + +#include +#include +#include "rbtree.h" + + +using std::cout; +using std::endl; +using std::cerr; + + +void RBTree::checkTree(){ + cout << "start" << endl; + this->root->countBlack(0); + cout << "stop" << endl; +} + +void RBTree::leftRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode* y = x->right; + x->right=y->left; + if (y->left != nil) y->left->parent=x; + y->parent=x->parent; + if (x->parent==this->nil) + root=y; + else { + if (x == x->parent->left) + x->parent->left=y; + else + x->parent->right=y; + } + y->left=x; + x->parent=y; + + // update counters of x and y ! + if (updateNode != 0) { + updateNode(x, this); + updateNode(y, this); + } +} + +void RBTree::rightRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode* y = x->left; + x->left=y->right; + if (y->right != nil) y->right->parent=x; + y->parent=x->parent; + + + if (x->parent==nil) + root=y; + else { + if (x == x->parent->right) { + x->parent->right=y; + } else { + x->parent->left=y; + } + } + + y->right=x; + x->parent=y; + if (updateNode != 0) { + updateNode(x, this); + updateNode(y, this); + } +} + +void RBTree::rbInsertFixup(RBNode* z, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode *y; + + if (z->parent==nil) return; + while (z->parent->color == RED) { + if (z->parent == z->parent->parent->left) + { + y = z->parent->parent->right; + + if (y->color==RED) + { + z->parent->color=BLACK; // Case 1a + y->color=BLACK; // Case 1a + z->parent->parent->color=RED; // Case 1a + z=z->parent->parent; // Case 1a + } + else + { + if (z==z->parent->right) + { + z=z->parent; // Case 2a + leftRotate(z, updateNode); // Case 2a + } + z->parent->color=BLACK; // Case 3a + z->parent->parent->color=RED; // Case 3a + rightRotate(z->parent->parent, updateNode); // Case 3a + } + } + else + { + + y = z->parent->parent->left; + + if (y->color==RED) + { + z->parent->color=BLACK; // Case 1b + y->color=BLACK; // Case 1b + z->parent->parent->color=RED; // Case 1b + z=z->parent->parent; // Case 1b + } + else + { + if (z==z->parent->left) + { + z=z->parent; // Case 2b + rightRotate(z, updateNode); // Case 2b + } + z->parent->color=BLACK; // Case 3b + z->parent->parent->color=RED; // Case 3b + leftRotate(z->parent->parent, updateNode); // Case 3b + } + } + if (z->parent==nil) return; + } //end of while + + root->color=BLACK; +} + + +void RBTree::rbDeleteFixup(RBNode *x, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode *w; + + while ((x != root) && (x->color == BLACK)) + { + if (x == x->parent->left) + { + w=x->parent->right; + + if (w->color == RED) + { + w->color=BLACK; // Case 1a + x->parent->color=RED; // Case 1a + leftRotate(x->parent, updateNode); // Case 1a + w=x->parent->right; // Case 1a + } + if ((w->left->color == BLACK) && (w->right->color == BLACK)) + { + w->color=RED; // Case 2a + x=x->parent; // Case 2a + } + else + { + if (w->right->color == BLACK) + { + w->left->color=BLACK; // Case 3a + w->color=RED; // Case 3a + rightRotate(w, updateNode); // Case 3a + w=x->parent->right; // Case 3a + } + w->color=x->parent->color; // Case 4a + x->parent->color=BLACK; // Case 4a + w->right->color=BLACK; // Case 4a + leftRotate(x->parent, updateNode); // Case 4a + x=root; // Case 4a + } + } + else + { + w=x->parent->left; + + if (w->color == RED) + { + w->color=BLACK; // Case 1b + x->parent->color=RED; // Case 1b + rightRotate(x->parent, updateNode); // Case 1b + w=x->parent->left; // Case 1b + } + if ((w->right->color == BLACK) && (w->left->color == BLACK)) + { + w->color=RED; // Case 2b + x=x->parent; // Case 2b + } + else + { + if (w->left->color == BLACK) + { + w->right->color=BLACK; // Case 3b + w->color=RED; // Case 3b + leftRotate(w, updateNode); // Case 3b + w=x->parent->left; // Case 3b + } + + w->color=x->parent->color; // Case 4b + x->parent->color=BLACK; // Case 4b + w->left->color=BLACK; // Case 4b + rightRotate(x->parent, updateNode); // Case 4b + + x=root; // Case 4b + } + } + } // end of while + x->color=BLACK; +} + +// quite similar to Cormen et al +void RBTree::rbDelete(RBNode *z, void (*updateNode)(RBNode* n, RBTree *T)){ + RBNode *y,*x,*y_oldParent; + y_oldParent=nil; + if (z->left == nil || z->right == nil) { + y = z; //z has no or one child, deletion is easy + } else { + y = treeSuccessor(z); + y_oldParent=y->parent; + } + + if (y->left != nil) { + x=y->left; + } else { + x=y->right; + } + + x->parent=y->parent; + + // cut out y: + if (y->parent == nil) { + root=x; + } else { + if (isLeftChild(y)) {y->parent->left = x;} + else {y->parent->right = x;} + } + + RBNodecolor old_y = y->color; + if (y != z) { // 2 children + //move y to z's old position and delete z + if (root==z) { + root=y; + } else { + if (isLeftChild(z)) z->parent->left = y; + else z->parent->right = y; + } + + y->parent = z->parent; + y->left = z->left; + y->right = z->right; + y->color = z->color; // don't forget to delete z after rbDelete + + y->left->parent = y; + y->right->parent = y; + + } + + + if (old_y == BLACK) { + rbDeleteFixup(x, updateNode); + } + + updateNode(y, this); + if (y_oldParent!=nil) updateNode(y_oldParent, this); + +} + + +RBNode* RBTree::treeSuccessor(RBNode *x){ + if (x->right != nil) return treeMinimum(x->right); + + RBNode *y = x->parent; + while ((y!=nil) && (x == y->right)) { + x=y; + y=y->parent; + } + return y; +} + +RBNode* RBTree::treePredecessor(RBNode *x){ + if (x->left != nil) return treeMaximum(x->left); + + RBNode *y = x->parent; + while ((y!=nil) && (x == y->left)) { + x=y; + y=y->parent; + } + return y; +} + + +RBNode* RBTree::treeMinimum(RBNode *x){ + while (x->left != nil) x=x->left; + return x; +} + +RBNode* RBTree::treeMaximum(RBNode *x){ + while (x->right != nil) x=x->right; + return x; +} + + +bool RBTree::isLeftChild(RBNode *n){ + #ifndef NDEBUG + if (n->parent == nil) { + cerr << "error: isLeftChild, no parent." << endl; + exit(EXIT_FAILURE); + } + #endif + return (n->parent->left == n); +} + +bool RBTree::isRightChild(RBNode *n){ + #ifndef NDEBUG + if ( n->parent == nil) { + cerr << "error: isLeftChild, no parent." << endl; + exit(EXIT_FAILURE); + } + #endif + return (n->parent->right == n); +} + + +RBNode* RBTree::findRightSiblingLeaf(RBNode *n){ + // go up: + while (true) { + if (n->parent!=nil) { + if (n==n->parent->right) + n=n->parent; + else + break; + } else + return nil; + } + + n=n->parent; + + // leftmost leaf in n is the right sibling searched + n=n->right; + + // go down: + while (n->left != nil) { + n=n->left; + } + + return n; +} + +RBNode* RBTree::findLeftSiblingLeaf(RBNode *n){ + // go up: + while (true) { + if (n->parent!=nil) { + if (n==n->parent->left) + n=n->parent; + else + break; + } else + return nil; + } + + n=n->parent; + + // rightmost leaf in n is the left sibling searched + n=n->left; + + // go down: + while (n->right != nil) { + n=n->right; + } + + return n; +} + + +int RBTree::getNodeMaxDepth(RBNode *n) { + int l; + int r; + if (n->left == nil) l=0; + else l=getNodeMaxDepth(n->left); + if (n->right == nil) r=0; + else r=getNodeMaxDepth(n->right); + + return (1+(l>r?l:r)); +} + +int RBTree::getNodeMinDepth(RBNode *n) { + int l; + int r; + if (n->left == 0) l=0; + else l=getNodeMinDepth(n->left); + if (n->right == 0) r=0; + else r=getNodeMinDepth(n->right); + + return (1+(l>r?r:l)); +} + + +void RBTree::printSubTree(RBNode *n){ + cout << ((n->color==BLACK)?"B":"R") << n << " ["; + if (n->left==nil) cout << "N"; + else printSubTree(n->left); + cout << ","; + if (n->right==nil) cout << "N"; + else printSubTree(n->right); + cout << "]"; +} + + +void RBTree::checkSubTree(RBNode *n){ + + if (n->left!=nil) { + if (n->left->parent != n) { + cout << "au"<< endl; + exit(1); + } + checkSubTree(n->left); + } + + if (n->right!=nil) { + if (n->right->parent != n) { + cout << "au"<< endl; + exit(1); + } + checkSubTree(n->right); + } + +} + + +void RBTree::checkNode(RBNode *n){ + if (n->left!=nil) { + if (n->left->parent != n) { + cout << "au"<< endl; + exit(1); + } + + } + + if (n->right!=nil) { + if (n->right->parent != n) { + cout << "au"<< endl; + exit(1); + } + } +} diff --git a/rbtree.h b/rbtree.h new file mode 100644 index 0000000..b59c0a3 --- /dev/null +++ b/rbtree.h @@ -0,0 +1,116 @@ +/*************************************************************************** + * Copyright (C) 2006 by Wolfgang Gerlach * + * No object matches key 'wgerlach'. * + * * + * This program is free software; you can redistribute it and/or modify * + * it under the terms of the GNU General Public License as published by * + * the Free Software Foundation; either version 2 of the License, or * + * (at your option) any later version. * + * * + * This program is distributed in the hope that it will be useful, * + * but WITHOUT ANY WARRANTY; without even the implied warranty of * + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * + * GNU General Public License for more details. * + * * + * You should have received a copy of the GNU General Public License * + * along with this program; if not, write to the * + * Free Software Foundation, Inc., * + * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. * + ***************************************************************************/ + + +// this is a red-black tree implementation by wolfgang Gerlach based on the algorithm provided by +// Cormen et al.: Introduction to Algorithms, Second Edition. MIT Press and McGraw-Hill, 2001 + + + +#ifndef RBTree +#define RBTree RBTree + + + + +enum RBNodecolor{BLACK,RED}; + + +// generic Red-Black Tree Node: +class RBNode +{ + public: + RBNode* parent; + RBNode* left; + RBNode* right; // 3*4 bytes + + enum RBNodecolor color; // due to structure alignment: 4 bytes !!! + + RBNode(){}; + + RBNode(RBNode *n) + : parent(n), left(n), right(n){ + color=RED; + } + + + virtual ~RBNode(){} //adds 4 bytes vtable + + void countBlack(int i){ + if (this->color == BLACK) i++; + if (this->left != this) this->left->countBlack(i); + else std::cout << i << ","; + if (this->right != this) this->right->countBlack(i); + else std::cout << i << ","; + } +}; + +class RBTree{ + public: + + RBNode *root; + RBNode *nil; + + + virtual ~RBTree(){ + // Don't double free! + if (root != this->nil) + deleteNode(root); + root = 0; + delete this->nil; + this->nil = 0; + } + + void checkTree(); + + void rbInsertFixup(RBNode* z, void (*updateNode)(RBNode* n, RBTree *T)); + void rbDeleteFixup(RBNode *x, void (*updateNode)(RBNode* n, RBTree *T)); + void rbDelete(RBNode *z, void (*updateNode)(RBNode* n, RBTree *T)); + RBNode* findRightSiblingLeaf(RBNode *n); + RBNode* findLeftSiblingLeaf(RBNode *n); + RBNode* treeSuccessor(RBNode *x); + RBNode* treePredecessor(RBNode *x); + RBNode* treeMinimum(RBNode *x); + RBNode* treeMaximum(RBNode *x); + + bool isLeftChild(RBNode *n); + bool isRightChild(RBNode *n); + + int getNodeMaxDepth(RBNode *n); + int getNodeMinDepth(RBNode *n); + + void printSubTree(RBNode *n); + void checkSubTree(RBNode *n); + void checkNode(RBNode *x); + + void deleteNode(RBNode* x){ + if (x->left!=nil && x->left) deleteNode(x->left); + if (x->right!=nil && x->right) deleteNode(x->right); + delete x; + } + + + private: + void leftRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)); + void rightRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)); + +}; + +#endif diff --git a/testTextCollection.cpp b/testTextCollection.cpp new file mode 100644 index 0000000..d005b8f --- /dev/null +++ b/testTextCollection.cpp @@ -0,0 +1,69 @@ +// Test driver for text collection +#include +using std::cout; +using std::endl; + +#include "TextCollection.h" +using SXSI::TextCollection; + +void printDocumentResult(TextCollection::document_result dr) +{ + TextCollection::document_result::iterator it; + printf("document result:"); + for (it = dr.begin(); it != dr.end(); ++it) + printf(" %i", *it); + printf("\n"); + fflush(stdout); +} + +void printFullResult(TextCollection::full_result fr) +{ + TextCollection::full_result::iterator it; + printf("full result:"); + for (it = fr.begin(); it != fr.end(); ++it) + printf(" (%i, %lu)", (*it).first, (*it).second); + printf("\n"); + fflush(stdout); +} + +int main() +{ + uchar *text = (uchar*) "acabab"; + TextCollection *csa = TextCollection::InitTextCollection(1); + csa->InsertText(text); + text = (uchar*) "abaca"; + csa->InsertText(text); + text = (uchar*) "abacb"; + csa->InsertText(text); + + csa->MakeStatic(); + + text = csa->GetText(0); + cout << "Text 0: \"" << text << "\"" << endl; + delete [] text; + text = csa->GetText(1); + cout << "Text 1: \"" << text << "\"" << endl; + delete [] text; + text = csa->GetText(2); + cout << "Text 2: \"" << text << "\"" << endl; + delete [] text; + + text = csa->GetText(2, 2, 4); + cout << "Substring of Text 3: \"" << text << "\"" << endl; + delete [] text; + + printf("n:o contains: %u\n", csa->CountContains((uchar *)"ac")); + printf("n:o suffix: %u\n", csa->CountSuffix((uchar *)"b")); + printf("n:o equal: %u\n", csa->CountEqual((uchar *)"acabab")); + printf("is equal: %u\n", csa->IsEqual((uchar *)"abacb")); + + TextCollection::document_result dr; + dr = csa->Contains((uchar*)"ab"); + printDocumentResult(dr); + + TextCollection::full_result fr; + fr = csa->FullContains((uchar *)"ab"); + printFullResult(fr); + + delete csa; +}