--- /dev/null
+#include "BitRank.h"
+
+/*****************************************************************************
+ * Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved. *
+ * *
+ * New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations. *
+ * *
+ * This library is free software; you can redistribute it and/or *
+ * modify it under the terms of the GNU Lesser General Public *
+ * License as published by the Free Software Foundation; either *
+ * version 2.1 of the License, or (at your option) any later version. *
+ * *
+ * This library is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *
+ * Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public *
+ * License along with this library; if not, write to the Free Software *
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *
+ ****************************************************************************/
+
+// Modified by Niko Välimäki
+
+/////////////
+//Rank(B,i)//
+/////////////
+//This Class use a superblock size of 256-512 bits
+//and a block size of 32-64 bits also
+
+
+const unsigned char __popcount_tab[] =
+{
+0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8,
+};
+
+const unsigned char select_tab[] =
+{
+0,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+8,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+};
+
+
+// bits needed to represent a number between 0 and n
+inline ulong bits (ulong n){
+ ulong b = 0;
+ while (n) { b++; n >>= 1; }
+ return b;
+}
+
+#if W == 32
+ // 32 bit version
+ inline unsigned popcount (register ulong x){
+ return __popcount_tab[(x >> 0) & 0xff] + __popcount_tab[(x >> 8) & 0xff] + __popcount_tab[(x >> 16) & 0xff] + __popcount_tab[(x >> 24) & 0xff];
+ }
+#else
+ // 64 bit version
+ inline unsigned popcount (register ulong x){
+ return __popcount_tab[(x >> 0) & 0xff] + __popcount_tab[(x >> 8) & 0xff] + __popcount_tab[(x >> 16) & 0xff] + __popcount_tab[(x >> 24) & 0xff] + __popcount_tab[(x >> 32) & 0xff] + __popcount_tab[(x >> 40) & 0xff] + __popcount_tab[(x >> 48) & 0xff] + __popcount_tab[(x >> 56) & 0xff];
+ }
+#endif
+
+inline unsigned popcount16 (register int x){
+ return __popcount_tab[x & 0xff] + __popcount_tab[(x >> 8) & 0xff];
+}
+
+inline unsigned popcount8 (register int x){
+ return __popcount_tab[x & 0xff];
+}
+
+BitRank::BitRank(ulong *bitarray, ulong n, bool owner) {
+ data=bitarray;
+ this->owner = owner;
+ this->n=n; // length of bitarray in bits
+ b = W; // b is a word
+ s=b*superFactor;
+ ulong aux=(n+1)%W;
+ if (aux != 0)
+ integers = (n+1)/W+1;
+ else
+ integers = (n+1)/W;
+ BuildRank();
+}
+
+BitRank::~BitRank() {
+ delete [] Rs;
+ delete [] Rb;
+ if (owner) delete [] data;
+}
+
+//Build the rank (blocks and superblocks)
+void BitRank::BuildRank()
+{
+ ulong num_sblock = n/s;
+ ulong num_block = n/b;
+ Rs = new ulong[num_sblock+1];//+1 we add the 0 pos
+ Rb = new uchar[num_block+1];//+1 we add the 0 pos
+
+ ulong j;
+ Rs[0] = 0lu;
+
+ for (j=1;j<=num_sblock;j++)
+ {
+ Rs[j]=BuildRankSub((j-1)*superFactor,superFactor)+Rs[j-1];
+ }
+
+ Rb[0]=0;
+ for (ulong k=1;k<=num_block;k++) {
+ j = k / superFactor;
+ Rb[k]=BuildRankSub(j*superFactor, k%superFactor);
+ }
+}
+
+ulong BitRank::BuildRankSub(ulong ini, ulong bloques){
+ ulong rank=0,aux;
+
+
+ for(ulong i=ini;i<ini+bloques;i++) {
+ if (i < integers) {
+ aux=data[i];
+ rank+=popcount(aux);
+ }
+ }
+ return rank; //return the numbers of 1's in the interval
+}
+
+
+//this rank ask from 0 to n-1
+ulong BitRank::rank(ulong i) {
+ ++i; // the following gives sum of 1s before i
+ return Rs[i>>8]+Rb[i>>wordShift]
+ +popcount(data[i >> wordShift] & ((1lu << (i & Wminusone))-1));
+}
+
+ulong BitRank::select(ulong x) {
+ // returns i such that x=rank(i) && rank(i-1)<x or n if that i not exist
+ // first binary search over first level rank structure
+ // then sequential search using popcount over a int
+ // then sequential search using popcount over a char
+ // then sequential search bit a bit
+
+ //binary search over first level rank structure
+ if (x == 0)
+ return 0;
+
+ ulong l=0, r=n/s;
+ ulong mid=(l+r)/2;
+ ulong rankmid = Rs[mid];
+ while (l<=r) {
+ if (rankmid<x)
+ l = mid+1;
+ else
+ r = mid-1;
+ mid = (l+r)/2;
+ rankmid = Rs[mid];
+ }
+ //sequential search using popcount over a int
+ ulong left;
+ left=mid*superFactor;
+ x-=rankmid;
+ ulong j = data[left];
+
+ unsigned ones = popcount(j);
+ while (ones < x) {
+ x-=ones;left++;
+ if (left > integers)
+ return n;
+
+ j = data[left];
+
+ ones = popcount(j);
+ }
+ //sequential search using popcount over a char
+ left=left*b;
+ rankmid = popcount8(j);
+ if (rankmid < x) {
+ j=j>>8;
+ x-=rankmid;
+ left+=8;
+ rankmid = popcount8(j);
+ if (rankmid < x) {
+ j=j>>8;
+ x-=rankmid;
+ left+=8;
+ rankmid = popcount8(j);
+ if (rankmid < x) {
+ j=j>>8;
+ x-=rankmid;
+ left+=8;
+ }
+ }
+ }
+
+ // then sequential search bit a bit
+ while (x>0) {
+ if (j&1lu) x--;
+ j=j>>1;
+ left++;
+ }
+ return left-1;
+}
+
+ulong BitRank::select0(ulong x) {
+ // returns i such that x=rank0(i) && rank0(i-1)<x or n if that i not exist
+ // first binary search over first level rank structure
+ // then sequential search using popcount over a int
+ // then sequential search using popcount over a char
+ // then sequential search bit a bit
+
+ //binary search over first level rank structure
+ if (x == 0)
+ return 0;
+
+ ulong l=0, r=n/s;
+ ulong mid=(l+r)/2;
+ ulong rankmid = mid * s - Rs[mid];
+ while (l<=r) {
+ if (rankmid<x)
+ l = mid+1;
+ else
+ r = mid-1;
+ mid = (l+r)/2;
+ rankmid = mid * s - Rs[mid];
+ }
+
+ //sequential search using popcount over a int
+ ulong left;
+ left=mid*superFactor;
+ x-=rankmid;
+ ulong j = data[left];
+
+ unsigned zeros = W - popcount(j);
+ while (zeros < x) {
+ x-=zeros;
+ left++;
+ if (left > integers)
+ return n;
+
+ j = data[left];
+ zeros = W - popcount(j);
+ }
+
+ //sequential search using popcount over a char
+ left=left*b;
+ rankmid = 8 - popcount8(j);
+ if (rankmid < x) {
+ j=j>>8;
+ x-=rankmid;
+ left+=8;
+ rankmid = 8 - popcount8(j);
+ if (rankmid < x) {
+ j=j>>8;
+ x-=rankmid;
+ left+=8;
+ rankmid = 8 - popcount8(j);
+ if (rankmid < x) {
+ j=j>>8;
+ x-=rankmid;
+ left+=8;
+ }
+ }
+ }
+
+ // then sequential search bit a bit
+ while (x>0) {
+ if (!(j&1lu)) x--;
+ j=j>>1;
+ left++;
+ }
+ return left - 1;
+}
+
+
+bool BitRank::IsBitSet(ulong i) {
+ return (1lu << (i % W)) & data[i/W];
+}
+
+ulong BitRank::NumberOfBits() {
+ return n;
+}
--- /dev/null
+/*****************************************************************************
+ * Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved. *
+ * *
+ * New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations. *
+ * *
+ * This library is free software; you can redistribute it and/or *
+ * modify it under the terms of the GNU Lesser General Public *
+ * License as published by the Free Software Foundation; either *
+ * version 2.1 of the License, or (at your option) any later version. *
+ * *
+ * This library is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU *
+ * Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public *
+ * License along with this library; if not, write to the Free Software *
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA *
+ ****************************************************************************/
+
+#ifndef _BITSTREAM_H_
+#define _BITSTREAM_H_
+
+#include "Tools.h"
+
+class BitRank {
+private:
+ // Check word length
+#if W == 32
+ static const unsigned wordShift = 5;
+ static const unsigned superFactor = 8; // 256 bit blocks
+#else
+ static const unsigned wordShift = 6;
+ static const unsigned superFactor = 4; // 256 bit blocks
+#endif
+
+ ulong *data; //here is the bit-array
+ bool owner;
+ ulong n,integers;
+ unsigned b,s;
+ ulong *Rs; //superblock array
+ uchar *Rb; //block array
+ ulong BuildRankSub(ulong, ulong); //internal use of BuildRank
+ void BuildRank(); //crea indice para rank
+public:
+ BitRank(ulong *, ulong, bool);
+ ~BitRank(); //destructor
+ ulong rank(ulong i); //Rank from 0 to n-1
+ ulong select(ulong x); // gives the position of the x:th 1.
+ ulong select0(ulong x); // gives the position of the x:th 0.
+
+ bool IsBitSet(ulong i);
+ ulong NumberOfBits();
+};
+
+#endif
--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ *****************************************************************************/
+#include "CSA.h"
+#include "TextCollection.h"
+#include <iostream>
+#include <queue>
+#include <iomanip>
+#include <set>
+#include <vector>
+#include <utility>
+#include <cassert>
+#include <cstring> // For strlen()
+using std::vector;
+using std::pair;
+using std::make_pair;
+using std::map;
+
+using SXSI::TextCollection;
+
+
+////////////////////////////////////////////////////////////////////////////
+// Class CSA::THuffAlphabetRank
+
+CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
+ left = NULL;
+ right = NULL;
+ bitrank = NULL;
+ ch = s[0];
+ leaf = false;
+ this->codetable = codetable;
+
+ bool *B = new bool[n];
+ TextPosition sum=0,i;
+ /*
+ for (i=0; i< n; i++) {
+ printf("%c:", (char)((int)s[i]-128));
+ for (r=0;r<codetable[(int)s[i]].bits;r++)
+ if (codetable[(int)s[i]].code & (1u <<r))
+ printf("1");
+ else printf("0");
+ printf("\n");
+ }
+ printf("\n");
+ if (level > 100) return;*/
+ for (i=0;i<n;i++)
+ if (codetable[(int)s[i]].code & (1u << level)) {
+ B[i] = true;
+ sum++;
+ }
+ else B[i] = false;
+ if (sum==0 || sum==n) {
+ delete [] B;
+ leaf = true;
+ return;
+ }
+ uchar *sfirst, *ssecond;
+ //if (n-sum > 0)
+ sfirst = new uchar[n-sum];
+ //if (sum > 0)
+ ssecond = new uchar[sum];
+ unsigned j=0,k=0;
+ for (i=0;i<n;i++)
+ if (B[i]) ssecond[k++] = s[i];
+ else sfirst[j++] = s[i];
+ ulong *Binbits = new ulong[n/W+1];
+ for (i=0;i<n;i++)
+ Tools::SetField(Binbits,1,i,B[i]);
+ delete [] B;
+ bitrank = new BitRank(Binbits,n,true);
+ //if (j > 0) {
+ left = new THuffAlphabetRank(sfirst,j,codetable,level+1);
+ delete [] sfirst;
+ //}
+ //if (k>0) {
+ right = new THuffAlphabetRank(ssecond,k,codetable,level+1);
+ delete [] ssecond;
+ //}
+}
+
+
+bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
+ // testing that the code works correctly
+ int C[256];
+ unsigned i,j;
+ bool correct=true;
+ for (j=0;j<256;j++)
+ C[j] = 0;
+ for (i=0;i<n;i++) {
+ C[(int)s[i]]++;
+ if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
+ correct = false;
+ printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i));
+ }
+ }
+ return correct;
+}
+
+CSA::THuffAlphabetRank::~THuffAlphabetRank() {
+ if (left!=NULL) delete left;
+ if (right!=NULL) delete right;
+ if (bitrank!=NULL)
+ delete bitrank;
+}
+
+
+////////////////////////////////////////////////////////////////////////////
+// Class CSA
+
+/**
+ * Constructor inits an empty dynamic FM-index.
+ * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
+ */
+CSA::CSA(unsigned samplerate)
+ : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),
+ codetable(0), dynFMI(0)
+{
+ this->samplerate = samplerate;
+
+ // FIXME TODO : DynFMI needs distribution of characters before hand
+ // This will create fully balanced wavelet tree for all chars [0, 255].
+ uchar temp[256];
+ for (unsigned i = 0; i < 255; ++i)
+ temp[i] = i+1;
+ temp[255] = 0;
+ dynFMI = new DynFMI(temp, 1, 255, false);
+}
+
+/**
+ * Insert new text
+ *
+ * Given text must include end-marker.
+ * Text identifiers are numbered starting from 0.
+ */
+void CSA::InsertText(uchar const * text)
+{
+ // Sanity check:
+ assert(dynFMI != 0);
+
+ TextPosition m = std::strlen((char *)text) + 1;
+ // Store text length and starting position
+ textLength.push_back(make_pair(m, n));
+ this->n += m;
+ dynFMI->addText(text, m);
+}
+
+void CSA::MakeStatic()
+{
+ // Sanity check:
+ if (dynFMI == 0)
+ {
+ std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
+ std::exit(0);
+ }
+
+ uchar *bwt = dynFMI->getBWT();
+ /*printf("1234567890123456789\n");
+ for (TextPosition i = 0; i < n; i ++)
+ if (bwt[i] < 50)
+ printf("%d", (int)bwt[i]);
+ else
+ printf("%c", bwt[i]);
+ printf("\n");
+ */
+
+/* for (TextPosition i = 1; i <= n; i ++)
+ {
+ printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
+ }*/
+
+ // Sanity check
+ assert(textLength.size() == dynFMI->getCollectionSize());
+
+ delete dynFMI;
+ dynFMI = 0;
+
+ ulong i, min = 0,
+ max;
+ for (i=0;i<256;i++)
+ C[i]=0;
+ for (i=0;i<n;++i)
+ C[(int)bwt[i]]++;
+ for (i=0;i<256;i++)
+ if (C[i]>0) {min = i; break;}
+ for (i=255;i>=min;--i)
+ if (C[i]>0) {max = i; break;}
+
+ // Print frequencies
+/* for(i = 0; i < 256; i++)
+ if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
+ fflush(stdout);*/
+
+ ulong prev=C[0], temp;
+ C[0]=0;
+ for (i=1;i<256;i++) {
+ temp = C[i];
+ C[i]=C[i-1]+prev;
+ prev = temp;
+ }
+ this->codetable = node::makecodetable(bwt,n);
+ alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);
+ //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");
+
+/* for (i = 0; i < n; ++i)
+ {
+ uchar c = alphabetrank->charAtPos(i);
+ TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
+ printf("LF[%lu] = %lu\n", i, j);
+ }*/
+
+ // Calculate BWT end-marker position (of last inserted text)
+ i = 0;
+ while (bwt[i] != '\0')
+ {
+ uchar c = alphabetrank->charAtPos(i);
+ i = C[c]+alphabetrank->rank(c, i)-1;
+ }
+ bwtEndPos = i;
+ //printf("bwtEndPos = %lu\n", bwtEndPos);
+
+ delete [] bwt;
+
+ // Make sampling tables
+ maketables();
+ // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small
+ // |class CSA| +256*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+...
+ //printf("FMindex takes %d B\n",
+ // 6*W/8+256*3*W/8+256*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8);
+}
+
+
+uchar* CSA::GetText(DocId k) const
+{
+ assert(k < (DocId)textLength.size());
+
+ uchar* result = new uchar[textLength[k].first];
+ TextPosition i = k;
+ TextPosition j = textLength[k].first-1;
+ result[j] = '\0';
+
+ uchar c = alphabetrank->charAtPos(i);
+ while (c != '\0')
+ {
+ --j;
+ result[j] = c;
+ i = C[c]+alphabetrank->rank(c,i)-1;
+
+ c = alphabetrank->charAtPos(i); // "next" char.
+ }
+ assert(j == 0);
+ return result;
+}
+
+uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
+{
+ assert(k < (DocId)textLength.size());
+ assert(j < textLength[k].first);
+ assert(i <= j);
+
+ // Start position of k'th text
+ ulong start = textLength[k].second;
+
+ return Substring(i + start, j-i+1);
+}
+
+/**
+ * Existential queries
+ */
+bool CSA::IsPrefix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ // Check for end-marker(s) in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ if (it != endmarkers.end() && it->first <= ep)
+ return true;
+ return false;
+}
+
+bool CSA::IsSuffix(uchar const *pattern) const
+{
+ // Here counting is as fast as isSuffix():
+ if (CountSuffix(pattern) > 0)
+ return true;
+ return false;
+}
+
+bool CSA::IsEqual(uchar const *pattern) const
+{
+ TextPosition m = std::strlen((char *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ // Check for end-marker(s) in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ if (it != endmarkers.end() && it->first <= ep)
+ return true;
+ return false;
+}
+
+bool CSA::IsContains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return true;
+
+ TextPosition sp = 0, ep = 0;
+ // Just check if pattern exists somewhere
+ ulong count = Search(pattern, m, &sp, &ep);
+
+ if (count > 0)
+ return true;
+ return false;
+}
+
+bool CSA::IsLessThan(uchar const*) const
+{
+ // TODO
+ return false;
+}
+
+/**
+ * Counting queries
+ */
+unsigned CSA::CountPrefix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ // Count end-markers in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ unsigned count = 0;
+ while (it != endmarkers.end() && it->first <= ep)
+ {
+ ++ count;
+ ++ it;
+ }
+
+ return count;
+}
+
+unsigned CSA::CountSuffix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker
+ unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
+
+ return count;
+}
+
+unsigned CSA::CountEqual(uchar const *pattern) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return textLength.size();
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ // Count end-markers in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ unsigned count = 0;
+ while (it != endmarkers.end() && it->first <= ep)
+ {
+ ++ count;
+ ++ it;
+ }
+
+ return count;
+}
+
+unsigned CSA::CountContains(uchar const * pattern) const
+{
+ // Here counting is as slow as fetching the result set
+ // because we would have to filter out occ's that fall within same document.
+ TextCollection::document_result result = Contains(pattern);
+ return result.size();
+}
+
+unsigned CSA::CountLessThan(uchar const * pattern) const
+{
+ // TODO
+ return 0;
+}
+
+/**
+ * Document reporting queries
+ */
+TextCollection::document_result CSA::Prefix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ Search(pattern, m, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ unsigned count = 0;
+ while (it != endmarkers.end() && it->first <= ep)
+ {
+ // End-marker we found belongs to the "previous" doc in the collection
+ DocId docId = ((it->second).first + 1) % textLength.size();
+ result.push_back(docId);
+ ++ count;
+ ++ it;
+ }
+
+ return result;
+}
+
+TextCollection::document_result CSA::Suffix(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Search with end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Check each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->charAtPos(i);
+ while (c != '\0' && !sampled->IsBitSet(i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ c = alphabetrank->charAtPos(i);
+ }
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+ // End-marker that we found belongs to the "previous" doc in collection:
+ DocId docId = (endm.first + 1) % textLength.size();
+ result.push_back(docId);
+ }
+ else
+ {
+ TextPosition textPos = suffixes[sampled->rank(i)-1];
+ result.push_back(DocIdAtTextPos(textPos));
+ }
+ }
+
+ return result;
+}
+
+TextCollection::document_result CSA::Equal(uchar const *pattern) const
+{
+ TextPosition m = strlen((char const *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Match including end-marker
+ Search(pattern, m+1, &sp, &ep);
+
+ TextCollection::document_result result;
+
+ // Report end-markers in result interval
+ map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+ it = endmarkers.lower_bound(sp);
+ unsigned count = 0;
+ while (it != endmarkers.end() && it->first <= ep)
+ {
+ // End-marker that we found belongs to the "previous" doc in collection:
+ DocId docId = ((it->second).first + 1) % textLength.size();
+ result.push_back(docId);
+ ++ count;
+ ++ it;
+ }
+
+ return result;
+}
+
+
+TextCollection::document_result CSA::Contains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return TextCollection::document_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ // We want unique document indentifiers, using std::set to collect them
+ std::set<DocId> resultSet;
+
+ // Check each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->charAtPos(i);
+ while (c != '\0' && !sampled->IsBitSet(i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ c = alphabetrank->charAtPos(i);
+ }
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+ // End-marker we found belongs to the "previous" doc in collection
+ DocId docId = (endm.first + 1) % textLength.size();
+ resultSet.insert(docId);
+ }
+ else
+ {
+ TextPosition textPos = suffixes[sampled->rank(i)-1];
+ resultSet.insert(DocIdAtTextPos(textPos));
+ }
+ }
+
+ // Convert std::set to std::vector (TODO better way to construct result vector?)
+ TextCollection::document_result result;
+ for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
+ result.push_back(*it);
+ return result;
+}
+
+TextCollection::document_result CSA::LessThan(uchar const * pattern) const
+{
+ // TODO
+ return document_result();
+}
+
+/**
+ * Full result set queries
+ */
+TextCollection::full_result CSA::FullContains(uchar const * pattern) const
+{
+ TextPosition m = strlen((char *)pattern);
+ if (m == 0)
+ return full_result();
+
+ TextPosition sp = 0, ep = 0;
+ // Search all occurrences
+ Search(pattern, m, &sp, &ep);
+
+ full_result result;
+
+ // Report each occurrence
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->charAtPos(i);
+ while (c != '\0' && !sampled->IsBitSet(i))
+ {
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ c = alphabetrank->charAtPos(i);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+ // End-marker we found belongs to the "previous" doc in the collection:
+ DocId docId = (endm.first+1)%textLength.size();
+ result.push_back(make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
+ // Read document identifier and its starting position in text order
+ DocId docId = DocIdAtTextPos(textPos);
+ result.push_back(make_pair(docId, textPos - textLength[docId].second));
+ }
+ }
+
+ return result;
+}
+
+
+/**
+ * Finds document identifier for given text position
+ *
+ * Starting text position of the document is stored into second parameter.
+ * Binary searching on text starting positions.
+ */
+TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
+{
+ assert(i < n);
+
+ DocId a = 0;
+ DocId b = textLength.size() - 1;
+ while (a < b)
+ {
+ DocId c = a + (b - a)/2;
+ if (textLength[c].second > i)
+ b = c - 1;
+ else if (textLength[c+1].second > i)
+ return c;
+ else
+ a = c + 1;
+ }
+
+ assert(a < (DocId)textLength.size());
+ assert(i > textLength[a].second);
+ assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
+ return a;
+}
+
+void CSA::Load(FILE *filename, unsigned samplerate)
+{
+ // TODO
+
+}
+
+void CSA::Save(FILE *filename)
+{
+ // TODO
+}
+
+
+/**
+ * Rest of the functions follow...
+ */
+
+uchar * CSA::Substring(TextPosition i, TextPosition l) const
+{
+ uchar *result = new uchar[l + 1];
+ if (l == 0)
+ {
+ result[0] = 0u;
+ return result;
+ }
+
+ TextPosition dist;
+ TextPosition k = i + l - 1;
+ // Check for end of the string
+ if (k > n - 1)
+ {
+ l -= k - n + 1;
+ k = n - 1;
+ }
+
+ TextPosition skip = samplerate - k % samplerate - 1;
+ TextPosition j;
+ if (k / samplerate + 1 >= n / samplerate)
+ {
+ j = bwtEndPos;
+ skip = n - k - 1;
+ }
+ else
+ {
+ j = positions[k/samplerate+1];
+ //cout << samplerate << ' ' << j << '\n';
+ }
+
+ for (dist = 0; dist < skip + l; dist++)
+ {
+ int c = alphabetrank->charAtPos(j);
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
+ j = endm.first; // LF-mapping for end-marker
+ }
+ else
+ j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
+ if (dist >= skip)
+ result[l + skip - dist - 1] = c;
+ }
+ result[l] = 0u;
+ return result;
+}
+
+ /*TextPosition CSA::inverse(TextPosition i)
+{
+ TextPosition skip = samplerate - i % samplerate;
+ TextPosition j;
+ if (i / samplerate + 1 >= n / samplerate)
+ {
+ j = bwtEndPos;
+ skip = n - i;
+ }
+ else
+ {
+ j = positions[i/samplerate+1];
+ //cout << samplerate << ' ' << j << '\n';
+ }
+
+ while (skip > 0)
+ {
+ int c = alphabetrank->charAtPos(j);
+ j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
+ skip --;
+ }
+ return j;
+ }*/
+
+ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
+ // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+ int c = (int)pattern[m-1];
+ TextPosition i=m-1;
+ TextPosition sp = C[c];
+ TextPosition ep = C[c+1]-1;
+ while (sp<=ep && i>=1)
+ {
+// printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+ c = (int)pattern[--i];
+ sp = C[c]+alphabetrank->rank(c,sp-1);
+ ep = C[c]+alphabetrank->rank(c,ep)-1;
+ }
+ *spResult = sp;
+ *epResult = ep;
+ if (sp<=ep)
+ return ep - sp + 1;
+ else
+ return 0;
+ }
+
+ /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep)
+{
+ sp = C[(int)c]+alphabetrank->rank(c,sp-1);
+ ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
+
+ if (sp<=ep)
+ return ep - sp + 1;
+ else
+ return 0;
+ }*/
+
+CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
+{
+ TextPosition dist=0;
+ while (!sampled->IsBitSet(i))
+ {
+ int c = alphabetrank->charAtPos(i);
+ if (c == '\0')
+ {
+ // map::operator[] is not const, using find() instead:
+ pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+ return endm.second + dist; // returns text position.
+ }
+ i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+ ++dist;
+ }
+
+ return suffixes[sampled->rank(i)-1]+dist;
+}
+
+CSA::~CSA() {
+ delete alphabetrank;
+ delete sampled;
+ delete [] suffixes;
+ delete [] positions;
+ delete [] codetable;
+}
+
+
+void CSA::maketables()
+{
+ ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
+
+ ulong *sampledpositions = new ulong[n/W+1];
+ suffixes = new ulong[sampleLength];
+ positions = new ulong[sampleLength];
+
+ ulong i,j=0;
+ for (i=0;i<n/W+1;i++)
+ sampledpositions[i]=0lu;
+
+ ulong x,p=bwtEndPos;
+ DocId textId = textLength.size();
+ ulong ulongmax = 0;
+ ulongmax--;
+
+ //positions:
+ for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
+ // i substitutes SA->GetPos(i)
+ x=(i==n-1)?0:i+1;
+
+ if (x % samplerate == 0) {
+ Tools::SetField(sampledpositions,1,p,1);
+ positions[x/samplerate] = p;
+ }
+
+ uchar c = alphabetrank->charAtPos(p);
+ if (c == '\0')
+ {
+ // Record the order of end-markers in BWT:
+ --textId;
+ endmarkers[p] = make_pair(textId, (TextPosition)x);
+ // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
+ p = textId; // Correct LF-mapping to the last char of the previous text.
+ }
+ else
+ p = C[c]+alphabetrank->rank(c, p)-1;
+ }
+ assert(textId == 0);
+ /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
+ {
+ printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
+ }*/
+
+ sampled = new BitRank(sampledpositions,n,true);
+
+ //suffixes:
+ for(i=0; i<sampleLength; i++) {
+ j = sampled->rank(positions[i]);
+ if (j==0) j=sampleLength;
+ suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
+ }
+}
+
+uchar * CSA::LoadFromFile(const char *filename)
+{
+ uchar *s;
+ std::ifstream file (filename, std::ios::in|std::ios::binary);
+ if (file.is_open())
+ {
+ std::cerr << "Loading CSA from file: " << filename << std::endl;
+ file.read((char *)&bwtEndPos, sizeof(TextPosition));
+ s = new uchar[n];
+ for (ulong offset = 0; offset < n; offset ++)
+ file.read((char *)(s + offset), sizeof(char));
+ file.close();
+ }
+ else
+ {
+ std::cerr << "Unable to open file " << filename << std::endl;
+ exit(1);
+ }
+ return s;
+}
+
+void CSA::SaveToFile(const char *filename, uchar *bwt)
+{
+ std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
+ if (file.is_open())
+ {
+ std::cerr << "Writing CSA to file: " << filename << std::endl;
+ file.write((char *)&bwtEndPos, sizeof(TextPosition));
+ std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
+ for (ulong offset = 0; offset < n; offset ++)
+ file.write((char *)(bwt + offset), sizeof(char));
+ file.close();
+ }
+ else
+ {
+ std::cerr << "Unable to open file " << filename << std::endl;
+ exit(1);
+ }
+}
+
+/*uchar * CSA::BWT(uchar *text)
+{
+ uchar *s;
+
+ DynFMI *wt = new DynFMI((uchar *) text, n);
+ s = wt->getBWT();
+ for (ulong i=0;i<n;i++)
+ if (s[i]==0u) {
+ bwtEndPos = i; // TODO: better solution ?
+ i=n;
+ }
+
+ delete wt;
+ return s;
+ }*/
+
+CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
+{
+ TCodeEntry *result = new TCodeEntry[ 256 ];
+
+ count_chars( text, n, result );
+ std::priority_queue< node, std::vector< node >, std::greater<node> > q;
+//
+// First I push all the leaf nodes into the queue
+//
+ for ( unsigned int i = 0 ; i < 256 ; i++ )
+ if ( result[ i ].count )
+ q.push(node( i, result[ i ].count ) );
+//
+// This loop removes the two smallest nodes from the
+// queue. It creates a new internal node that has
+// those two nodes as children. The new internal node
+// is then inserted into the priority queue. When there
+// is only one node in the priority queue, the tree
+// is complete.
+//
+
+ while ( q.size() > 1 ) {
+ node *child0 = new node( q.top() );
+ q.pop();
+ node *child1 = new node( q.top() );
+ q.pop();
+ q.push( node( child0, child1 ) );
+ }
+//
+// Now I compute and return the codetable
+//
+ q.top().maketable(0u,0u, result);
+ q.pop();
+ return result;
+}
+
+
+void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
+{
+ if ( child0 )
+ {
+ child0->maketable( SetBit(code,bits,0), bits+1, codetable );
+ child1->maketable( SetBit(code,bits,1), bits+1, codetable );
+ delete child0;
+ delete child1;
+ }
+ else
+ {
+ codetable[value].code = code;
+ codetable[value].bits = bits;
+ }
+}
+
+void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
+{
+ ulong i;
+ for (i = 0 ; i < 256 ; i++ )
+ counts[ i ].count = 0;
+ for (i=0; i<n; i++)
+ counts[(int)text[i]].count++;
+}
+
+unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
+ return x | (bit << pos);
+}
+
--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ *****************************************************************************/
+
+#ifndef _CSA_H_
+#define _CSA_H_
+
+#include <map>
+#include <vector>
+#include "BitRank.h"
+#include "dynFMI.h"
+#include "TextCollection.h"
+
+/**
+ * Implementation of the TextCollection interface
+ *
+ * Implementation notes:
+ *
+ * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
+ * which requires O(n log \sigma) bits of memory. If we know the distribution
+ * of characters beforehand, space can easily be made O(nH_0).
+ *
+ * The method MakeStatic() constructs a Succinct Suffix Array with a huffman
+ * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
+ *
+ * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
+ * with any FM-index variant supporting LF-mapping.
+ */
+class CSA : public SXSI::TextCollection {
+public:
+ /**
+ * Constructor with (default) samplerate
+ */
+ explicit CSA(unsigned);
+ ~CSA();
+ /**
+ * Following functions implement the interface described in TextCollection.h.
+ * For details/documentation, see TextCollection.h.
+ */
+ void InsertText(uchar const *);
+ void MakeStatic();
+ uchar* GetText(DocId) const;
+ uchar* GetText(DocId, TextPosition, TextPosition) const;
+
+ bool IsPrefix(uchar const *) const;
+ bool IsSuffix(uchar const *) const;
+ bool IsEqual(uchar const *) const;
+ bool IsContains(uchar const *) const;
+ bool IsLessThan(uchar const *) const;
+
+ unsigned CountPrefix(uchar const *) const;
+ unsigned CountSuffix(uchar const *) const;
+ unsigned CountEqual(uchar const *) const;
+ unsigned CountContains(uchar const *) const;
+ unsigned CountLessThan(const unsigned char*) const;
+
+ // document_result is inherited from SXSI::TextCollection.
+ document_result Prefix(uchar const *) const;
+ document_result Suffix(uchar const *) const;
+ document_result Equal(uchar const *) const;
+ document_result Contains(uchar const *) const;
+ document_result LessThan(uchar const *) const;
+
+ // full_result is inherited from SXSI::TextCollection.
+ full_result FullContains(uchar const *) const;
+
+ // TODO implement:
+ void Load(FILE *, unsigned);
+ void Save(FILE *);
+
+private:
+ class TCodeEntry {
+ public:
+ unsigned count;
+ unsigned bits;
+ unsigned code;
+ TCodeEntry() {count=0;bits=0;code=0u;};
+ };
+
+
+ class THuffAlphabetRank {
+ // using fixed 0...255 alphabet
+ private:
+ BitRank *bitrank;
+ THuffAlphabetRank *left;
+ THuffAlphabetRank *right;
+ TCodeEntry *codetable;
+ uchar ch;
+ bool leaf;
+ public:
+ THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
+ ~THuffAlphabetRank();
+ bool Test(uchar *, TextPosition);
+
+ inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
+ THuffAlphabetRank const * temp=this;
+ if (codetable[c].count == 0) return 0;
+ unsigned level = 0;
+ unsigned code = codetable[c].code;
+ while (!temp->leaf) {
+ if ((code & (1u<<level)) == 0) {
+ i = i-temp->bitrank->rank(i);
+ temp = temp->left;
+ }
+ else {
+ i = temp->bitrank->rank(i)-1;
+ temp = temp->right;
+ }
+ ++level;
+ }
+ return i+1;
+ };
+ inline bool IsCharAtPos(int c, TextPosition i) const {
+ THuffAlphabetRank const * temp=this;
+ if (codetable[c].count == 0) return false;
+ unsigned level = 0;
+ unsigned code = codetable[c].code;
+ while (!temp->leaf) {
+ if ((code & (1u<<level))==0) {
+ if (temp->bitrank->IsBitSet(i)) return false;
+ i = i-temp->bitrank->rank(i);
+ temp = temp->left;
+ }
+ else {
+ if (!temp->bitrank->IsBitSet(i)) return false;
+ i = temp->bitrank->rank(i)-1;
+ temp = temp->right;
+ }
+ ++level;
+ }
+ return true;
+ }
+ inline int charAtPos(TextPosition i) const {
+ THuffAlphabetRank const * temp=this;
+ while (!temp->leaf) {
+ if (temp->bitrank->IsBitSet(i)) {
+ i = temp->bitrank->rank(i)-1;
+ temp = temp->right;
+ }
+ else {
+ i = i-temp->bitrank->rank(i);
+ temp = temp->left;
+ }
+ }
+ return (int)temp->ch;
+ }
+ };
+
+ class node {
+ private:
+ unsigned weight;
+ uchar value;
+ node *child0;
+ node *child1;
+
+ void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
+ static void count_chars(uchar *, TextPosition , TCodeEntry *);
+ static unsigned SetBit(unsigned , unsigned , unsigned );
+ public:
+ node( unsigned char c = 0, unsigned i = 0 ) {
+ value = c;
+ weight = i;
+ child0 = 0;
+ child1 = 0;
+ }
+
+ node( node* c0, node *c1 ) {
+ value = 0;
+ weight = c0->weight + c1->weight;
+ child0 = c0;
+ child1 = c1;
+ }
+
+
+ bool operator>( const node &a ) const {
+ return weight > a.weight;
+ }
+
+ static TCodeEntry *makecodetable(uchar *, TextPosition);
+ };
+
+ static const unsigned char print = 1;
+ static const unsigned char report = 1;
+ TextPosition n;
+ unsigned samplerate;
+ ulong C[256];
+ TextPosition bwtEndPos;
+ THuffAlphabetRank *alphabetrank;
+ BitRank *sampled;
+ ulong *suffixes;
+ ulong *positions;
+ TCodeEntry *codetable;
+ DynFMI *dynFMI;
+ // Map from end-marker in BWT to pair (textId, sampled text position)
+ std::map<TextPosition, std::pair<DocId, TextPosition> > endmarkers;
+ // Vector of pairs of <text length, text start position>
+ std::vector<std::pair<TextPosition, TextPosition> > textLength;
+
+ // Private methods
+ uchar * BWT(uchar *);
+ uchar * LoadFromFile(const char *);
+ void SaveToFile(const char *, uchar *);
+ void maketables();
+
+ // Following are not part of the public API
+ DocId DocIdAtTextPos(TextPosition) const;
+ ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
+ TextPosition Lookup(TextPosition) const;
+ TextPosition Inverse(TextPosition) const;
+ TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
+ TextPosition Psi(TextPosition) const;
+ uchar * Substring(TextPosition, TextPosition) const;
+};
+
+#endif
--- /dev/null
+#include "TextCollection.h"
+#include "CSA.h"
+
+namespace SXSI
+{
+ /**
+ * Init text collection
+ *
+ * See CSA.h for more details.
+ */
+ TextCollection * TextCollection::InitTextCollection(unsigned samplerate)
+ {
+ TextCollection *result = new CSA(samplerate);
+ return result;
+ }
+}
--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2008 by Niko Valimaki <nvalimak@cs.helsinki.fi> *
+ * Text collection interface for an in-memory XQuery/XPath engine *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ******************************************************************************/
+
+#ifndef _SXSI_TextCollection_h_
+#define _SXSI_TextCollection_h_
+
+#include "Tools.h" // Defines ulong and uchar.
+#include <vector>
+#include <utility> // Defines std::pair.
+
+// Default samplerate for suffix array samples
+#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64
+
+namespace SXSI
+{
+ /**
+ * General interface for a text collection
+ *
+ * Class is virtual, make objects by calling
+ * the static method InitTextCollection().
+ */
+ class TextCollection
+ {
+ public:
+ // Type of document identifier
+ typedef int DocId;
+ // Type for text position (FIXME ulong or long?)
+ typedef ulong TextPosition;
+
+ /**
+ * Init an instance of a text collection object
+ *
+ * Returns a pointer to an object implementing this interface.
+ */
+ static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE);
+ /**
+ * Load from a file
+ *
+ * New samplerate can be given, otherwise will use the one specified in the save file!
+ * Note: This is not a static method; call InitTextCollection() first to get the object handle.
+ *
+ * Throws an exception if something goes wrong (unlikely since we are passing a file descriptor).
+ *
+ */
+ virtual void Load(FILE *, unsigned samplerate = 0) = 0;
+ /**
+ * Save data structure into a file
+ */
+ virtual void Save(FILE *) = 0;
+ /**
+ * Virtual destructor
+ */
+ virtual ~TextCollection() { };
+ /**
+ * Insert text
+ *
+ * Must be a zero-terminated string from alphabet [1,255].
+ * Can not be called after makeStatic().
+ * The i'th text insertion gets an identifier value i-1.
+ * In other words, document identifiers start from 0.
+ */
+ virtual void InsertText(uchar const *) = 0;
+ /**
+ * Make static
+ *
+ * Convert to a static collection; reduces space and time complexities.
+ * New texts can not be inserted after this operation.
+ */
+ virtual void MakeStatic() = 0;
+
+ /**
+ * Displaying content
+ *
+ * Returns the i'th text in the collection.
+ * The numbering starts from 0.
+ */
+ virtual uchar* GetText(DocId) const = 0;
+ /**
+ * Returns substring [i, j] of k'th text
+ *
+ * Note: Parameters i and j are text positions inside the k'th text.
+ */
+ virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0;
+ /**
+ * Returns backwards (reverse) iterator to the end of i'th text
+ *
+ * Note: Do we need this?
+ * Forward iterator would be really in-efficient compared to
+ * getText(k) and getText(k, i, j).
+ *
+ * TODO Define and implement const_reverse_iterator.
+ */
+ //const_reverse_iterator rend(DocId) const;
+
+ /**
+ * Existential queries
+ */
+ // Is there a text prefixed by given string?
+ virtual bool IsPrefix(uchar const *) const = 0;
+ // Is there a text having given string as a suffix?
+ virtual bool IsSuffix(uchar const *) const = 0;
+ // Is there a text that equals given string?
+ virtual bool IsEqual(uchar const *) const = 0;
+ // Does a text contain given string?
+ virtual bool IsContains(uchar const *) const = 0;
+ // Is there a text that is lexicographically less than given string?
+ virtual bool IsLessThan(uchar const *) const = 0;
+
+ /**
+ * Counting queries
+ *
+ * Result is the number of documents.
+ */
+ virtual unsigned CountPrefix(uchar const *) const = 0;
+ virtual unsigned CountSuffix(uchar const *) const = 0;
+ virtual unsigned CountEqual(uchar const *) const = 0;
+ virtual unsigned CountContains(uchar const *) const = 0;
+ virtual unsigned CountLessThan(uchar const *) const = 0;
+
+ /**
+ * Document reporting queries
+ *
+ * Result is a vector of document id's in some undefined order.
+ */
+ // Data type for results
+ typedef std::vector<DocId> document_result;
+ virtual document_result Prefix(uchar const *) const = 0;
+ virtual document_result Suffix(uchar const *) const = 0;
+ virtual document_result Equal(uchar const *) const = 0;
+ virtual document_result Contains(uchar const *) const = 0;
+ virtual document_result LessThan(uchar const *) const = 0;
+
+ /**
+ * Full reporting queries
+ *
+ * Result is a vector of pairs <doc id, offset> in some undefined order.
+ */
+ // Data type for results
+ typedef std::vector<std::pair<DocId, TextPosition> > full_result;
+ virtual full_result FullContains(uchar const *) const = 0;
+
+ protected:
+ // Protected constructor; call the static function InitTextCollection().
+ TextCollection() { };
+
+ // No copy constructor or assignment
+ TextCollection(TextCollection const&);
+ TextCollection& operator = (TextCollection const&);
+ };
+}
+#endif
--- /dev/null
+/*
+ * Collection of basic tools and defines
+ */
+
+#include "Tools.h"
+#include <cstdio>
+
+
+time_t Tools::startTime;
+
+void Tools::StartTimer()
+{
+ startTime = time(NULL);
+}
+
+double Tools::GetTime()
+{
+ time_t stopTime = time(NULL);
+ return difftime( stopTime, startTime );
+}
+
+uchar * Tools::GetRandomString(unsigned min, unsigned max, unsigned &alphabetSize)
+{
+ unsigned len = std::rand() % (max - min) + min;
+ alphabetSize = std::rand() % 26 + 1;
+ uchar* temp = new uchar[len + 2];
+ for (unsigned i = 0; i < len; i++)
+ temp[i] = 97 + std::rand() % alphabetSize;
+ temp[len] = 0u ;temp[len+1] = '\0';
+ return temp;
+}
+
+
+void Tools::PrintBitSequence(ulong *A, ulong len)
+{
+ for(ulong i = 0; i < len; i++)
+ if (GetField(A, 1, i))
+ std::cout << "1";
+ else
+ std::cout << "0";
+ std::cout << "\n";
+}
+
+unsigned Tools::FloorLog2(ulong i)
+{
+ unsigned b = 0;
+ if (i == 0)
+ return 0;
+ while (i)
+ {
+ b++;
+ i >>= 1;
+ }
+ return b - 1;
+}
+
+unsigned Tools::CeilLog2(ulong i)
+{
+ unsigned j = FloorLog2(i);
+ if ((ulong)(1lu << j) != i)
+ return j + 1;
+
+ return j;
+}
+
+uchar * Tools::GetFileContents(char *filename, ulong maxSize)
+{
+ std::ifstream::pos_type posSize;
+ std::ifstream file ((char *)filename, std::ios::in|std::ios::binary|std::ios::ate);
+ if (file.is_open())
+ {
+ posSize = file.tellg();
+ ulong size = posSize;
+ if (maxSize != 0 && size > maxSize)
+ size = maxSize;
+ char *memblock = new char [size + 1];
+ file.seekg (0, std::ios::beg);
+ file.read (memblock, size);
+ memblock[size] = '\0';
+ file.close();
+ return (uchar *)memblock;
+ }
+ else
+ return 0;
+}
+
+unsigned Tools::bits (ulong n)
+
+ { unsigned b = 0;
+ while (n)
+ { b++; n >>= 1; }
+ return b;
+ }
+
+
--- /dev/null
+/*
+ * Collection of basic tools and defines
+ */
+
+
+#ifndef _TOOLS_H_
+#define _TOOLS_H_
+
+#include <iostream>
+#include <fstream>
+#include <ctime>
+#include <climits>
+
+// Generates an error if __WORDSIZE is not defined
+#ifndef __WORDSIZE
+#error Missing definition of __WORDSIZE; Please define __WORDSIZE in Tools.h!
+#endif
+
+// Check word length on GNU C/C++:
+// __WORDSIZE should be defined in <bits/wordsize.h>, which is #included from <limits.h>
+#if __WORDSIZE == 64
+# define W 64
+#else
+# define W 32
+#endif
+
+#define WW (W*2)
+#define Wminusone (W-1)
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+
+class Tools
+{
+private:
+ static time_t startTime;
+public:
+ static void StartTimer();
+ static double GetTime();
+ static uchar * GetRandomString(unsigned, unsigned, unsigned &);
+ static void PrintBitSequence(ulong *, ulong);
+ static uchar * GetFileContents(char *, ulong =0);
+ static ulong * bp2bitstream(uchar *);
+ static unsigned FloorLog2(ulong);
+ static unsigned CeilLog2(ulong);
+ static unsigned bits (ulong);
+
+ static inline void SetField(ulong *A, register unsigned len, register ulong index, register ulong x)
+ {
+ ulong i = index * len / W,
+ j = index * len - i * W;
+ ulong mask = (j+len < W ? ~0lu << j+len : 0)
+ | (W-j < W ? ~0lu >> (W-j) : 0);
+ A[i] = (A[i] & mask) | x << j;
+ if (j + len > W)
+ {
+ mask = ((~0lu) << (len + j - W));
+ A[i+1] = (A[i+1] & mask)| x >> (W - j);
+ }
+ }
+
+ static inline ulong GetField(ulong *A, register unsigned len, register ulong index)
+ {
+ register ulong i = index * len / W,
+ j = index * len - W * i,
+ result;
+ if (j + len <= W)
+ result = (A[i] << (W - j - len)) >> (W - len);
+ else
+ {
+ result = A[i] >> j;
+ result = result | (A[i+1] << (WW - j - len)) >> (W - len);
+ }
+ return result;
+ }
+
+
+ static inline ulong GetVariableField(ulong *A, register unsigned len, register ulong index)
+ {
+ register ulong i=index/W, j=index-W*i, result;
+ if (j+len <= W)
+ result = (A[i] << (W-j-len)) >> (W-len);
+ else {
+ result = A[i] >> j;
+ result = result | (A[i+1] << (WW-j-len)) >> (W-len);
+ }
+ return result;
+ }
+
+ static inline void SetVariableField(ulong *A, register unsigned len, register ulong index, register ulong x) {
+ ulong i=index/W,
+ j=index-i*W;
+ ulong mask = (j+len < W ? ~0lu << j+len : 0)
+ | (W-j < W ? ~0lu >> (W-j) : 0);
+ A[i] = (A[i] & mask) | x << j;
+ if (j+len>W) {
+ mask = ((~0lu) << (len+j-W));
+ A[i+1] = (A[i+1] & mask)| x >> (W-j);
+ }
+ }
+
+
+};
+
+#endif
--- /dev/null
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * No object matches key 'wgerlach'. *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+
+#include "bittree.h"
+
+
+using std::cout;
+using std::endl;
+using std::cerr;
+using std::bitset;
+using std::ostream;
+
+//std::ostream& operator<<(std::ostream& os, node& n) { ... }
+
+
+ BVTree::~BVTree(){
+ delete tempbit;
+ tempbit = 0;
+ }
+
+
+
+/**
+BVTree::BVTree(char *readfile){
+
+ std::ifstream file(readfile);
+ if (!file)
+ {
+ cout << "error reading file "<< readfile <<" !\n";
+ exit(EXIT_FAILURE);
+ }
+
+
+ tempnil = (BVNode*) ((RBTree*) this)->nil;
+ tempbit = new bitset<2*logn>;
+
+
+ char c;
+ bitset<8> bs;
+
+ while (file.get(c))
+ {
+ bs = c;
+
+ for (int i=7; i>=0; i--)
+ appendBit(bs[i]);
+ }
+
+}
+**/
+
+
+
+
+void BVTree::iterateReset(){
+ iterate=1;
+ iterateLocal=0;
+ if (root!=nil) {
+ iterateNode = (BVNode*) treeMinimum(root);
+ iterateRank = ((*iterateNode->block)[0])?1:0;
+ } else {
+ iterateNode=getNil();
+ iterateRank = 0;
+ }
+}
+
+bool BVTree::iterateGetBit(){
+ return ((*iterateNode->block)[iterateLocal]);
+}
+
+ulong BVTree::iterateGetRank(bool bit){
+ if (bit) return iterateRank;
+ else return (iterate - iterateRank);
+}
+
+bool BVTree::iterateNext(){
+ iterate++;
+
+ if (iterate > getPositions()) return false;
+
+
+ if (iterateLocal < iterateNode->myPositions-1) {
+ iterateLocal++;
+ } else {
+ // jump to next leaf;
+ iterateNode = (BVNode*) treeSuccessor(iterateNode);
+ #ifndef NDEBUG
+ if (iterateNode==getNil()) {
+ cout << "iterateNode==getNil()" << endl;
+ return false;
+ }
+ #endif
+ iterateLocal=0;
+ }
+ if ((*iterateNode->block)[iterateLocal]) iterateRank++;
+
+ return true;
+}
+
+ulong BVTree::getPositions(){
+
+ if (getRoot() == getNil()) return 0;
+ return getRoot()->subTreePositions;
+}
+
+ulong BVTree::getRank(){
+
+ if (getRoot() == getNil()) return 0;
+ return getRoot()->subTreeRank;
+}
+
+
+void BVTree::printNode(BVNode *n){
+ int commas=(2*logn)/10;
+ int commashift = -1;
+
+
+ cout << "address: " << n << endl;
+
+ if (n->block != 0 && (n!=getNil()))
+ {
+ int size = 2*logn + 1 +commas;
+
+ char *myblock = (char *) new char[size];
+ if (myblock == 0) {
+ cerr << "error: printNode: myblock == 0" << endl;
+ exit(0);
+ }
+
+ //read: i, write: i+commashift
+ for (int i=0; i<(int)(n->myPositions); i++){
+ if (i%10 == 0) commashift++;
+ myblock[i+commashift]='0' + (*n->block)[i];
+ if (i+commashift >= size-1) {
+ cerr << "X) printNode: array wrong index" << endl;
+ exit(0);
+ }
+
+ }
+
+ cout << "size=" << size << endl;
+ cout << "n->myPositions=" << n->myPositions << endl;
+ for (int i=n->myPositions; i<(2*logn); i++){
+ if ((i%10) == 0) commashift++;
+
+ myblock[i+commashift]='-';
+ if (i+commashift > size-2) {
+ cerr << "A) printNode: array wrong index" << endl;
+ exit(0);
+ }
+ }
+
+ commashift = 0;
+ for (int i=10; i < 2*logn; i++){
+ if (i%10 == 0)
+ {
+ if (i+commashift >= size-2) {
+ cerr << "B) printNode: array wrong index" << endl;
+ exit(0);
+ }
+
+ myblock[i+commashift]=',';
+ commashift++;
+ }
+
+ }
+
+ myblock[size - 1]='\0';
+
+ cout << "block: \"" << myblock << "\"" << endl;
+ delete myblock;
+
+ }
+ else
+ cout << "block: none" << endl;
+
+
+ cout << "myPositions: " << n->myPositions << endl;
+ cout << "myRank: " << n->myRank << endl;
+ cout << "subTreePositions: " << n->subTreePositions << endl;
+ cout << "subTreeRank: " << n->subTreeRank << endl;
+ cout << "color: " << ((n->color==RED)?"RED":"BLACK") << endl;
+ cout << "parent: " << n->getParent() << endl;
+ cout << "left: " << n->getLeft() << endl;
+ cout << "right:" << n->getRight() << endl << endl;
+
+}
+
+
+int BVTree::getTreeMaxDepth(){
+ return getNodeMaxDepth(root);
+}
+
+int BVTree::getTreeMinDepth(){
+ return getNodeMinDepth(root);
+}
+
+
+void BVTree::updateCounters(BVNode *n){
+
+ if (n == getNil()) return;
+
+ ulong lR = 0;
+ ulong lP = 0;
+ ulong rR = 0;
+ ulong rP = 0;
+
+ if (n->getLeft() != getNil()) {
+ lR = n->getLeft()->subTreeRank;
+ lP = n->getLeft()->subTreePositions;
+ }
+
+ if (n->getRight() != getNil()) {
+ rR = n->getRight()->subTreeRank;
+ rP = n->getRight()->subTreePositions;
+ }
+
+ n->subTreeRank =lR + rR + n->myRank;
+ n->subTreePositions=lP + rP + n->myPositions;
+
+}
+
+ulong BVTree::getLocalRank(BVNode* n, ulong position){
+
+ #ifndef NDEBUG
+ if (position > n->myPositions) {
+ cerr << "error: getLocalRank: invalid position in block.\n";
+ exit(EXIT_FAILURE);
+ }
+ #endif
+
+ *tempbit =*(n->block)<<((2*logn)-position);
+ return tempbit->count();
+
+ // old version:
+ //rank = 0;
+ //for (ulong i = 0; i < position; i++) {
+ // if ((*n->block)[i]) rank++;
+ //}
+}
+
+ulong BVTree::getLocalSelect1(BVNode* n, ulong query){
+ ulong select =0;
+ #ifndef NDEBUG
+ if (query > n->myPositions) {
+ cerr << "error: getLocalSelect1: invalid position in block.\n";
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ ulong i;
+ for (i = 0; select < query; i++) { // TODO is there a faster solution similar to rank ?
+ if ((*n->block)[i]) select++;
+ }
+
+ return i;
+}
+
+ulong BVTree::getLocalSelect0(BVNode* n, ulong query){
+ ulong select =0;
+ #ifndef NDEBUG
+ if (query > n->myPositions) {
+ cerr << "error: getLocalSelect0: invalid position in block.\n";
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ ulong i;
+ for (i = 0; select < query; i++) {
+ if (!(*n->block)[i]) select++;
+ }
+
+ return i;
+}
+
+
+void BVTree::printNode(ulong i){
+ BVNode* x = getRoot();
+
+ #ifndef NDEBUG
+ if (x == getNil()) {
+ cerr << "error: printNode(int i): root=NULL.\n";
+ exit(EXIT_FAILURE);
+
+ }
+
+ if (i > x->myPositions) {
+ cerr << "error: printNode(int i): invalid position in block.\n";
+ exit(EXIT_FAILURE);
+ }
+ #endif
+
+ ulong lP=0;
+ bool search = true;
+ //find the corresponding block:
+ while (search)
+ {
+
+ lP = x->getLeft()->subTreePositions;
+
+
+ if (lP >= i)
+ {
+ x=x->getLeft();
+ }
+ else if (lP+x->myPositions >= i){
+ i-=lP;
+ search = false;
+ }
+ else{
+ i-=(lP+x->myPositions);
+ x=x->getRight();
+ }
+ }
+
+
+ cout << "i=" << i << endl;
+ printNode(x);
+}
+
+
+bool BVTree::operator[](ulong i){
+
+ BVNode* x = getRoot();
+ ulong lsP=0;
+ bool search = true;
+ //find the corresponding block:
+ while (search)
+ {
+
+ lsP = x->getLeft()->subTreePositions;
+
+ if (lsP >= i)
+ {
+ #ifndef NDEBUG
+ if (x->getLeft()==getNil()) {
+ cout << "lsP: " << lsP << endl;
+ printNode(x);
+ cout << "ihhh" << endl;
+ checkTree();
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ x=x->getLeft();
+ }
+ else if (lsP+x->myPositions >= i){
+ i-=lsP;
+ search = false;
+ }
+ else{
+ i-=(lsP+x->myPositions);
+ #ifndef NDEBUG
+ if (x->getRight()==getNil()) {
+ cout << "i: " << i << endl;
+ cout << "lsP: " << lsP << endl;
+ cout << "x->myPositions: " << x->myPositions << endl;
+ printNode(x);
+ cout << "ihhh" << endl;
+ checkTree();
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ x=x->getRight();
+ }
+ }
+
+ return (*x->block)[i-1];
+}
+
+
+ulong BVTree::rank1(ulong i){
+ BVNode* x = getRoot();
+
+ if (i == this->getPositions() + 1) i--;
+ #ifndef NDEBUG
+ if (i > this->getPositions() ) {
+ cerr << "error: rank1(0): invalid position in bittree: " << i << endl;
+ cerr << "error: rank1(0): this->getPositions(): " << this->getPositions()<< endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+
+
+ ulong lsP=0;
+ ulong lsR=0;
+ ulong rank=0;
+ bool search = true;
+ //find the corresponding block:
+ while (search)
+ {
+
+ lsP = x->getLeft()->subTreePositions;
+ lsR = x->getLeft()->subTreeRank;
+
+ if (lsP >= i)
+ {// cout << "L" << endl;
+ x=x->getLeft();
+ }
+ else if (lsP+x->myPositions >= i){
+ i-=lsP;
+ rank+=lsR;
+ search = false;
+ }
+ else{//cout << "R" << endl;
+ i-=(lsP+x->myPositions);
+ rank+=(lsR+x->myRank);
+ x=x->getRight();
+ }
+ }
+
+ rank+=getLocalRank(x, i);
+ return rank;
+}
+
+ulong BVTree::rank0(ulong i){
+ if (this->getPositions() == 0) return 0;
+ return (i-rank1(i));
+}
+
+ulong BVTree::select1(ulong i){
+ BVNode* x = getRoot();
+ ulong select=0;
+ #ifndef NDEBUG
+ if (i > x->subTreeRank ) {
+ cerr << "error: select1: invalid position in bittree: " << i << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+
+ ulong lsP=0;
+ ulong lsR=0;
+ bool search = true;
+ //find the corresponding block:
+ while (search)
+ {
+ // update subTree-counters
+
+
+ lsP = x->getLeft()->subTreePositions;
+ lsR = x->getLeft()->subTreeRank;
+
+ if (lsR >= i) {
+ x=x->getLeft();
+ }
+ else if (lsR+x->myRank >= i) {
+ i-=lsR;
+ select+=lsP;
+ search = false;
+ }
+ else {
+ i-=(lsR+x->myRank);
+ select+=(lsP+x->myPositions);
+ x=x->getRight();
+ }
+ }
+ select+=getLocalSelect1(x, i);
+
+
+
+ return select;
+}
+
+ulong BVTree::select0(ulong i){
+ BVNode* x = getRoot();
+ ulong select=0;
+
+ #ifndef NDEBUG
+ if (i > (x->subTreePositions - x->subTreeRank) || i < 0) {
+ cerr << "error: select1: invalid position in bittree: " << i << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+
+
+ ulong lsP=0;
+ ulong lsR=0;
+ ulong lmR=0;
+ ulong lmP=0;
+ bool search = true;
+ //find the corresponding block:
+ while (search)
+ {
+ // update subTree-counters
+
+
+ lsP = x->getLeft()->subTreePositions;
+ lsR = x->getLeft()->subTreeRank;
+ lmR = x->getLeft()->myRank;
+ lmP = x->getLeft()->myPositions;
+
+ if (lsP-lsR >= i)
+ {
+ x=x->getLeft();
+ }
+ else if ((lsP-lsR)+(x->myPositions-x->myRank) >= i){
+ i-=lsP-lsR;
+ select+=lsP;
+ search = false;
+ }
+ else{
+ i-=((lsP-lsR)+(x->myPositions-x->myRank));
+ select+=(lsP+x->myPositions);
+ x=x->getRight();
+ }
+ }
+
+ select+=getLocalSelect0(x, i);
+
+ return select;
+}
+
+
+void BVTree::updateCountersOnPathToRoot(BVNode *walk){
+
+ while (walk != getNil()) {
+ updateCounters(walk);
+ walk=walk->getParent();
+ }
+}
+
+//deletes the BVNode and all its children, destroys reb-black-tree property!
+void BVTree::deleteNode(BVNode *n){
+ if (n==getNil() || n == 0) return;
+
+ if (n->getLeft() != getNil() && n->getLeft()) deleteNode(n->getLeft());
+ if (n->getRight() != getNil() && n->getRight()) deleteNode(n->getRight());
+
+ delete n;
+}
+
+
+
+
+// TODO improve by returning bitvalue
+
+void BVTree::deleteBit(ulong i){
+ ulong old_i = i;
+ BVNode* x = getRoot();
+ bool bit;
+ ulong rank=0;
+
+ #ifndef NDEBUG
+ if (x == getNil()) {
+ cerr << "error: deleteBit, root is empty\n"; //shouldn't happen
+ exit(EXIT_FAILURE);
+ }
+
+ if (i > x->subTreePositions || i < 1)
+ {
+ cerr << "error: A, position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen
+ exit(EXIT_FAILURE);
+ }
+ #endif
+
+ ulong lsP=0;
+ ulong lsR=0;
+
+ bool search = true;
+ //find the corresponding block:
+ while (search)
+ {
+ // update of pointers is not yet possible: call updateCountersOnPathToRoot
+
+ lsP = x->getLeft()->subTreePositions;
+ lsR = x->getLeft()->subTreeRank;
+
+
+ if (lsP >= i)
+ {
+ #ifndef NDEBUG
+ if (x->getLeft()==getNil()) exit(EXIT_FAILURE);
+ #endif
+ x=x->getLeft();
+ }
+ else if (lsP+x->myPositions >= i){
+ i-=lsP;
+ rank+=lsR;
+ search = false;
+ }
+ else{
+ i-=(lsP+x->myPositions);
+ rank+=(lsR+x->myRank); // for speedup!
+ #ifndef NDEBUG
+ if (x->getRight()==getNil()) exit(EXIT_FAILURE);
+ #endif
+ x=x->getRight();
+ }
+ }
+
+
+
+ // now delete the bit from the block x:
+ bit =(*x->block)[i-1];
+
+ // store bit and rank information for speedup
+ lastBitDeleted=bit;
+ rank+=getLocalRank(x, i);
+ lastRank=(bit?rank:old_i-rank);
+
+ #ifndef NDEBUG
+ if (i > x->myPositions) {
+ cerr << "error: B, position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ bitset<2*logn> mask;
+
+
+ if ( i > 1 )
+ {
+ mask.set();
+ mask>>=(2*logn - i + 1);
+ mask &= *(x->block);
+ (*(x->block)>>=i)<<=i-1; // shift bits by one
+ (*x->block)|=mask; // restore bits in front of deleted bit
+ } else {
+ *(x->block)>>=1; // shift bits by one
+ }
+
+ x->myPositions--;
+ if (bit) x->myRank--;
+
+ updateCountersOnPathToRoot(x);
+
+
+ if (x->myPositions == 0){ // if merging was not possible:
+
+ rbDelete(x, callUpdateCountersOnPathToRoot);
+ delete x;
+
+ } else if (x->myPositions <= logn/2) // merge (and rotate), if necessary:
+ {
+
+ BVNode *sibling = (BVNode*) treeSuccessor(x);
+ //cout << "try to merge -----------------------------------------" << endl;
+ if (sibling != getNil())
+ {
+ if (x->myPositions + sibling->myPositions < 2*logn) //merge !
+ {
+ //cout << "merge with right sibling -----------------------------------------" << endl;
+ // move bits from sibling to x:
+
+ (*sibling->block)<<=x->myPositions;
+ (*x->block)|=(*sibling->block);
+
+ x->myPositions+=sibling->myPositions;
+ x->myRank+=sibling->myRank;
+ updateCountersOnPathToRoot(x);
+ rbDelete(sibling, callUpdateCountersOnPathToRoot);
+ delete sibling;
+ }
+ else sibling = getNil(); //to try the left sibling!
+ }
+ else if ((sibling= (BVNode*) treePredecessor(x)) != getNil())
+ {
+ if (x->myPositions + sibling->myPositions < 2*logn) //merge !
+ {
+ // move bits from sibling to x: (more complicated, needs shift!)
+ //cout << "merge with left sibling -----------------------------------------" << endl;
+ (*x->block)<<=sibling->myPositions;
+ x->myPositions+=sibling->myPositions;
+
+ (*x->block)|=(*sibling->block);
+
+ x->myRank+=sibling->myRank;
+ updateCountersOnPathToRoot(x);
+ rbDelete(sibling, callUpdateCountersOnPathToRoot);
+ delete sibling;
+ }
+
+
+
+ } // end else if
+ } // end else if
+
+
+
+}
+
+void BVTree::writeTree(char *writefile){
+ std::ofstream write(writefile);
+ if (!write)
+ {
+ cerr << "error: Error writing file: " << writefile<< "." << endl;
+ exit(EXIT_FAILURE);
+ }
+ writeTree(write);
+}
+
+
+ulong* BVTree::getBits(){
+ BVNode *n = getRoot();
+ ulong blockCounter=0;
+ ulong len = getPositions();
+
+ int W = sizeof(ulong)*8;
+
+
+ ulong bitsLength = len/W + ((len%W==0)?0:1);
+
+ ulong *bits = new ulong[bitsLength];
+
+ ulong i=0; //0 .. bitsLength-1
+ int j=0; //0 .. W-1
+
+ ulong mask_LeftBitSet = 1;
+
+ mask_LeftBitSet <<= W-1;
+
+ n=(BVNode*) treeMinimum(root);
+ while (n != getNil()) {
+ #ifndef NDEBUG
+ if (n->block == 0) {
+ cerr << "getBits(): block not found !!!" << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ for (blockCounter=0; blockCounter < n->myPositions; blockCounter++) {
+ if (j == W) {
+ i++;
+ j=0;
+ }
+ bits[i] >>= 1; // set 0
+
+ if ((*n->block)[blockCounter]) bits[i] |= mask_LeftBitSet; // set 1
+ j++;
+ }
+ n=(BVNode*) treeSuccessor(n);
+ }
+
+ if (i != bitsLength-1) {
+ cout << "last index is wrong" << endl;
+ exit(EXIT_FAILURE);
+ }
+ while (j < W) {
+ bits[i] >>= 1;
+ j++;
+ }
+
+ return bits;
+}
+
+void BVTree::writeTree(ostream& stream){
+ BVNode *x;
+ char c=0;
+ ulong cCounter=0;
+ ulong blockCounter=0;
+
+ x = (BVNode*) treeMinimum(getRoot());
+
+ while (x != getNil()){
+ blockCounter=x->myPositions;
+ while (blockCounter > 0)
+ {
+ while (blockCounter > 0 && cCounter < 8 )
+ {
+ c<<=1;
+ if ((*x->block)[x->myPositions - blockCounter]) c++;
+ cCounter++;
+ blockCounter--;
+ }
+ if (cCounter == 8)
+ {
+ stream << c;
+ cCounter = 0;
+ }
+ }
+
+
+ x = (BVNode*) treeSuccessor(x);
+ }
+
+ if (cCounter != 0) {
+ while (cCounter < 8 )
+ {
+ c<<=1; // fill with zeros.
+ cCounter++;
+ }
+ stream << c;
+ }
+}
+
+
+void BVTree::appendBit(bool bit){
+ ulong pos = 1;
+ if (root != getNil()) pos= getRoot()->subTreePositions + 1;
+
+ insertBit(bit, pos);
+}
+
+void BVTree::insertBit(bool bit, ulong i){
+ if (getRoot() == getNil())
+ {
+ BVNode *newNode;
+ newNode = new BVNode(getNil());
+ newNode->color=BLACK;
+ bitset<2*logn> *newBlock;
+ newBlock=new bitset<2*logn>;
+ newNode->block = newBlock;
+ setRoot(newNode);
+
+ }
+
+ BVNode* x = getRoot();
+
+ #ifndef NDEBUG
+ if (i > x->subTreePositions+1)
+ {
+ printNode(x);
+ cerr << "error: insertBit: position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen
+ exit(EXIT_FAILURE);
+ }
+ if (i < 1)
+ {
+ cerr << "error: insertBit: position " << i <<" is not valid.\n"; //shouldn't happen
+ exit(EXIT_FAILURE);
+ }
+ #endif
+
+ ulong lsP=0;
+
+
+ while (true)
+ {
+ // update subTree-counters
+ (x->subTreePositions)++;
+ if (bit) (x->subTreeRank)++;
+
+ lsP = x->getLeft()->subTreePositions;
+
+ if (lsP >= i)
+ {// cout << "A" << endl;
+ x=x->getLeft();
+ }
+ else if (lsP+x->myPositions >= i-1){ //-1 to append to last position in current node
+ i-=lsP;
+ break;
+ }
+ else{
+ i-=(lsP+x->myPositions);
+ x=x->getRight();
+ }
+ }
+
+ // now put the bit into the block x:
+ bitset<2*logn> mask;
+
+ if (x->myPositions > 0)
+ {
+ if (i != 1)
+ {
+ mask.set();
+ mask>>=2*logn - i +1; // make 1's at all positions < i
+ mask &= *(x->block); // store bits in front of new bit
+ (*(x->block)>>=i-1)<<=i; // shift bits by one
+ (*x->block)|=mask; // restore bits in front of new bit
+ }
+ else
+ (*x->block)<<=1;
+ }
+
+ (*x->block)[i-1]=bit; // set new bit
+
+ // update local pointers
+ (x->myPositions)++; //update p (leaf)
+ if (bit) (x->myRank)++; //update r (leaf)
+
+ #ifndef NDEBUG
+ if ((int)x->myPositions > 2*logn)
+ {
+ cerr << "error: positions in block already too many.\n"; //shouldn't happen
+ exit(EXIT_FAILURE);
+ }
+ #endif
+
+ // split and rotate, if necessary
+ if ((int)x->myPositions == 2*logn)
+ {
+ //cout << "split !-------------" << endl;
+
+ // new node:
+ BVNode *newNode;
+ newNode = new BVNode(getNil());
+
+ //find place for new node:
+ BVNode *y;// some parent of new node
+ if (x->getRight() != getNil()) {
+ y = (BVNode*) this->treeMinimum(x->getRight());
+ y->setLeft(newNode);
+ } else {
+ y=x;
+ y->setRight(newNode);
+ }
+ newNode->setParent(y);
+
+ //new block:
+ bitset<2*logn> *newBlock;
+ newBlock=new bitset<2*logn>;
+ *newBlock=*x->block>>logn; //copy of bits into the new block
+
+ mask.set();
+ mask>>=logn;
+ *x->block &= mask; //delete bits that already have been copied to the new block
+
+ newNode->block=newBlock;
+ newNode->myRank=(*newNode->block).count();
+ newNode->myPositions=logn;
+ newNode->color=RED;
+
+ //update old node x:
+ x->myRank-=newNode->myRank; //=(*x->block).count();
+ x->myPositions=logn;
+
+ updateCountersOnPathToRoot(newNode); // meets x
+ rbInsertFixup(newNode, callUpdateCounters);
+
+ } // end if
+
+
+}
+
+void BVTree::checkSubTree(BVNode *n){
+ ulong lP = 0;
+ ulong lR = 0;
+ ulong rP = 0;
+ ulong rR = 0;
+
+ if (n->getLeft()!=getNil()) {
+ lP = n->getLeft()->subTreePositions;
+ lR = n->getLeft()->subTreeRank;
+ if (n->getLeft()->getParent() != n) {
+ cout << "au"<< endl;
+ exit(1);
+ }
+
+ }
+
+ if (n->getRight()!=getNil()) {
+ rP = n->getRight()->subTreePositions;
+ rR = n->getRight()->subTreeRank;
+ if (n->getRight()->getParent() != n) {
+ cout << "au"<< endl;
+ exit(1);
+ }
+ }
+
+ if ( (n->subTreePositions != (n->myPositions + lP + rP)) ||
+ (n->subTreeRank != (n->myRank + lR + rR)) ){
+ cout << "checkSubTree: error" <<endl;
+ cout << "lP: " << lP << endl;
+ cout << "lR: " << lR << endl;
+ cout << "rP: " << rP << endl;
+ cout << "rR: " << rR << endl;
+ cout << "n->myPositions + lP + rP: " << n->myPositions + lP + rP << endl;
+ cout << "n->myRank + lR + rR: " << n->myRank + lR + rR << endl;
+ printNode(n);
+ printNode(n->getLeft());
+ printNode(n->getRight());
+ exit(1);
+ }
+
+ if (n->getLeft()!=getNil()) checkSubTree(n->getLeft());
+ if (n->getRight()!=getNil()) checkSubTree(n->getRight());
+}
+
+void callUpdateCounters(RBNode *n, RBTree *T){
+ ((BVTree*)T)->updateCounters((BVNode*) n);
+}
+
+void callUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){
+ ((BVTree*)T)->updateCountersOnPathToRoot((BVNode*) n);
+}
+
--- /dev/null
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * No object matches key 'wgerlach'. *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+// Implementation of the Dynamic Bit Vector with Indels problem
+// space: O(n) time: O(log(n))
+// papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text
+// Indexes. CPM 2006, Chapter 3.4 Dynamic Structures for Bit Vectors
+// also: W.-L. Chan, W.-K. Hon, and T.-W. Lam. Compressed index for a dynamic collection
+// of texts. In Proc. CPM04, LNCS 3109, pages 445-456, 2004
+
+#ifndef BVTree
+#define BVTree BVTree
+
+#include <iostream>
+#include <bitset>
+#include <cstdlib>
+#include <map>
+#include <stack>
+#include <cmath>
+#include <fstream>
+#include <vector>
+#include <cstdio>
+
+#include "rbtree.h"
+
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+
+#ifndef LOGN
+#define LOGN 64
+#endif
+
+const int logn = LOGN;
+
+//upperBound = 2 * logn;
+//lowerBound = logn / 2;
+
+
+class BVNode;
+class BVTree;
+
+void callUpdateCounters(RBNode *n, RBTree *T);
+void callUpdateCountersOnPathToRoot(RBNode *n, RBTree *T);
+
+class BVNode : public RBNode
+{
+ public:
+ ulong myPositions; // 4*4 bytes = 16 bytes
+ ulong myRank;
+ ulong subTreePositions; //number of positions stored in the subtree rooted at this node
+ ulong subTreeRank; //number of bits set in the subtree rooted at this node
+
+ std::bitset<2*logn> *block; // 4 bytes
+
+
+ BVNode()
+ : RBNode(this), myPositions(0), myRank(0), subTreePositions(0), subTreeRank(0), block(0) {
+ }
+
+
+ BVNode(BVNode* n)
+ : RBNode(n), myPositions(0), myRank(0), subTreePositions(0), subTreeRank(0), block(0) {
+ }
+
+ ~BVNode(){
+ delete block;
+ block = 0;
+ }
+
+
+ BVNode* getParent(){
+ return ((BVNode*) ((RBNode*) this)->parent);
+ }
+
+ BVNode* getLeft(){
+ return ((BVNode*) ((RBNode*) this)->left);
+ }
+
+ BVNode* getRight(){
+ return ((BVNode*) ((RBNode*) this)->right);
+ }
+
+ void setParent(BVNode* n){
+ ((RBNode*) this)->parent=(RBNode*)n;
+ }
+
+ void setLeft(BVNode* n){
+ ((RBNode*) this)->left=(RBNode*)n;
+ }
+
+ void setRight(BVNode* n){
+ ((RBNode*) this)->right=(RBNode*)n;
+ }
+
+};
+
+
+class BVTree : public RBTree{
+public:
+
+ //Constructors
+ BVTree(){
+
+ setNil(new BVNode());
+ setRoot(getNil());
+
+ tempnil = getNil();
+
+ tempbit = new std::bitset<2*logn>;
+ }
+
+ //Destructor:
+ ~BVTree();
+
+ bool operator[](ulong);
+
+
+ // inserts bit at position i, countings begins with 1:
+ void appendBit(bool bit);
+ void insertBit(bool bit, ulong i);
+ void deleteBit(ulong i);
+
+ ulong rank0(ulong i);
+ ulong rank1(ulong i);
+ ulong rank(bool b, ulong i){return b?rank1(i):rank0(i);}
+
+ ulong select0(ulong i);
+ ulong select1(ulong i);
+ ulong select(bool b, ulong i){return b?select1(i):select0(i);}
+
+ void setRoot(BVNode* n){
+ ((RBTree*) this)->root=(RBNode*)n;
+ }
+
+ BVNode* getRoot(){
+ return ((BVNode*) ((RBTree*) this)->root);
+ }
+
+ void setNil(BVNode* n){
+ tempnil = n;
+ ((RBTree*) this)->nil=(RBNode*)n;
+ }
+
+ BVNode* getNil(){
+ return tempnil;
+
+ }
+
+ // write bits back into a stream:
+ ulong* getBits();
+ void writeTree(char *writefile);
+ void writeTree(std::ostream& stream); //e.g. stream = cout
+
+ int getTreeMaxDepth();
+ int getTreeMinDepth();
+ ulong getPositions();
+ ulong getRank();
+
+ void iterateReset();
+ bool iterateGetBit();
+ bool iterateNext();
+ ulong iterateGetRank(bool bit);
+
+ bool getLastBitDeleted(){return lastBitDeleted;}
+ ulong getLastRank(){return lastRank;}
+
+ void checkSubTree(BVNode *n);
+
+ void updateCounters(BVNode *n);
+ void updateCountersOnPathToRoot(BVNode *walk);
+
+ //debug:
+ void printNode(ulong i);
+
+protected:
+
+ ulong iterate;
+ ulong iterateLocal;
+ ulong iterateRank;
+ BVNode *iterateNode;
+
+ BVNode *tempnil;
+
+ bool lastBitDeleted;
+ ulong lastRank;
+
+
+ std::bitset<2*logn> *tempbit;
+
+ // content of BVNode, for debugging:
+ void printNode(BVNode *n);
+
+ // other operations:
+ ulong getLocalRank(BVNode* n, ulong position);
+ ulong getLocalSelect0(BVNode* n, ulong query);
+ ulong getLocalSelect1(BVNode* n, ulong query);
+
+ void deleteNode(BVNode *n);
+ void deleteLeaf(BVNode *leaf);
+};
+
+
+#endif
+
--- /dev/null
+BitRank.o: BitRank.cpp BitRank.h Tools.h
+CSA.o: CSA.cpp CSA.h BitRank.h Tools.h dynFMI.h bittree.h rbtree.h \
+ handle.h pos.h
+TextCollection.o: TextCollection.cpp TextCollection.h Tools.h
+Tools.o: Tools.cpp Tools.h
+bittree.o: bittree.cpp bittree.h rbtree.h
+dynFMI.o: dynFMI.cpp dynFMI.h bittree.h rbtree.h handle.h pos.h
+handle.o: handle.cpp handle.h rbtree.h pos.h
+pos.o: pos.cpp pos.h rbtree.h handle.h
+rbtree.o: rbtree.cpp rbtree.h
+testTextCollection.o: testTextCollection.cpp CSA.h BitRank.h Tools.h \
+ dynFMI.h bittree.h rbtree.h handle.h pos.h TextCollection.h
--- /dev/null
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+
+
+#include <iostream>
+#include <cstdlib>
+#include <fstream>
+#include <stack>
+#include <queue>
+#include <functional>
+#include <algorithm>
+
+#include "dynFMI.h"
+
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::ios;
+using std::priority_queue;
+using std::vector;
+
+
+
+// -------- DynFMI --------
+
+DynFMI::~DynFMI(){
+ deleteDynFMINodes(root);
+ root = 0;
+ delete[] leaves; //free(leaves) // ???;
+#if SAMPLE!=0
+ delete SampledSAPositionsIndicator;
+#endif
+ delete handle;
+ delete pos;
+#if SAMPLE!=0
+ delete sampledSATree;
+#endif
+}
+
+void DynFMI::deleteDynFMINodes(WaveletNode *n){
+ if (n->right) deleteDynFMINodes(n->right);
+ if (n->left) deleteDynFMINodes(n->left);
+ delete n;
+}
+
+DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){
+ initEmptyDynFMI(text, x, n);
+ if (del) delete [] text;
+}
+
+DynFMI::DynFMI(char *readfile){
+
+ std::ifstream file(readfile);
+ if (!file)
+ {
+ cerr << "error reading file: " << readfile << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ file.seekg(0, ios::end);
+ ulong n = file.tellg();
+ file.seekg(0, ios::beg);
+
+ uchar *text = new uchar[n];
+
+ char c;
+ ulong i=0;
+
+
+ while (file.get(c))
+ {
+ text[i]=c;
+ i++;
+ }
+ file.close();
+
+ initEmptyDynFMI(text, 1, n);
+
+
+ delete text;
+
+}
+
+void DynFMI::iterateReset(){
+ iterate = 1;
+ recursiveIterateResetWaveletNode(root);
+}
+
+void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){
+ w->bittree->iterateReset();
+
+ if (w->left) recursiveIterateResetWaveletNode(w->left);
+ if (w->right) recursiveIterateResetWaveletNode(w->right);
+}
+
+
+bool DynFMI::iterateNext(){
+ iterate++;
+ return !(iterate > getSize());
+}
+
+uchar DynFMI::iterateGetSymbol(){
+
+ ulong i = iterate;
+ WaveletNode *walk = root;
+ bool bit;
+
+ while (true) {
+
+ bit = walk->bittree->iterateGetBit(); // TODO improve
+ i=walk->bittree->iterateGetRank(bit);
+
+ walk->bittree->iterateNext();
+
+
+ if (bit) { //bit = 1
+ if (walk->right == 0) return walk->c1;
+ walk=walk->right;
+ } else { // bit = 0
+ if (walk->left == 0) return walk->c0;
+ walk=walk->left;
+ }
+
+
+ } // end of while
+
+}
+
+
+uchar* DynFMI::getBWT(){
+ ulong n = root->bittree->getPositions();
+ n++;
+ uchar *text = new uchar[n];
+
+ bool data=true;
+ // old slow version:
+ //for (ulong i=1; i <= root->bittree->getPositions(); i++)
+ // text[i-1]=(*this)[i];
+
+ ulong i = 0;
+
+ iterateReset();
+
+ while (data) {
+ text[i] = iterateGetSymbol();
+ data = iterateNext();
+ i++;
+ }
+ text[n-1]=0;
+
+ return text;
+}
+
+std::ostream& DynFMI::getBWTStream(std::ostream& stream){
+ ulong n = root->bittree->getPositions();
+ n++;
+ //uchar *text = new uchar[n];
+
+ bool data=true;
+ // old slow version:
+ //for (ulong i=1; i <= root->bittree->getPositions(); i++)
+ // text[i-1]=(*this)[i];
+
+ ulong i = 0;
+
+ iterateReset();
+
+ while (data) {
+ //text[i] = iterateGetSymbol();
+ stream << iterateGetSymbol();
+ data = iterateNext();
+ i++;
+ }
+ //text[n-1]=0;
+
+ return stream;
+}
+
+//void DynFMI::printDynFMIContent(ostream& stream){
+// uchar c;
+// for (ulong i=1; i<=getSize(); i++)
+// {
+// c =(*this)[i];
+// if (c==0) c= '#';
+// stream << c;
+// }
+
+
+void DynFMI::deleteLeaves(WaveletNode *node){
+ bool leaf = true;
+
+ if (node->left) {
+ // internal node
+ leaf = false;
+ deleteLeaves(node->left);
+
+ }
+ if (node->right){
+ leaf = false;
+ deleteLeaves(node->right);
+ }
+
+ if (leaf) {
+ // is a leaf, delete it!
+ if (node->parent) {
+ if (node==node->parent->left) node->parent->left=0;
+ else node->parent->right=0;
+ }
+ delete node;
+ }
+}
+
+void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){
+ #ifndef NDEBUG
+ if (node == node->left) {
+ cout << "makeCodes: autsch" << endl;
+ exit(0);
+ }
+ #endif
+
+ if (node->left) {
+ makeCodes(code | (0 << bits), bits+1, node->left);
+ makeCodes(code | (1 << bits), bits+1, node->right);
+ } else {
+ codes[node->c0] = code;
+ codelengths[node->c0] = bits+1;
+ }
+}
+
+void DynFMI::appendBVTrees(WaveletNode *node){
+ node->bittree = new BVTree();
+
+ if (node->left) appendBVTrees(node->left);
+ if (node->right) appendBVTrees(node->right);
+}
+
+void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){
+ // pointers to the leaves for select
+ leaves = (WaveletNode**) new WaveletNode*[256];
+ for(int j=0; j<256; j++) leaves[j]=0;
+
+
+ ulong i;
+ for(i=0; i < n; i++) {
+ if (text[i]==0) {
+ cerr << "Error: unsigned char 0 is reserved." << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ if (leaves[text[i]]==0) {
+ leaves[text[i]] = new WaveletNode(text[i]);
+ }
+ leaves[text[i]]->weight++;
+ }
+
+ // separation symbol:
+ leaves[0] = new WaveletNode((uchar)0);
+ leaves[0]->weight=x;
+
+ // Veli's approach:
+ priority_queue< WaveletNode*, vector<WaveletNode*>, std::greater<WaveletNode*> > q;
+
+
+ for(int j=255; j>=0; j--){
+ if (leaves[j]!=0) {
+ q.push( (leaves[j]) );
+ }
+ codes[j] = 0;
+ codelengths[j] = 0;
+ }
+
+ // creates huffman shape:
+ while (q.size() > 1) {
+
+ WaveletNode *left = q.top();
+ q.pop();
+
+ WaveletNode *right = q.top();
+ q.pop();
+
+ q.push( new WaveletNode(left, right) );
+ }
+
+
+ root = q.top();
+ q.pop();
+
+
+ makeCodes(0,0, root); // writes codes and codelengths
+
+
+ // merge leaves (one leaf represent two characters!)
+ for(int j=0; j<256; j++){
+
+ if (leaves[j]) {
+
+ if (leaves[j]->parent->left==leaves[j]) {
+ leaves[j]->parent->c0=j;
+ } else {
+ leaves[j]->parent->c1=j;
+ }
+ leaves[j]=leaves[j]->parent; // merge
+ }
+ }
+
+ deleteLeaves(root);
+ appendBVTrees(root);
+
+ // array C needed for backwards search
+ for(int j=0; j<256+256; j++) C[j] = 0;
+
+#if SAMPLE!=0
+ this->SampledSAPositionsIndicator = new BVTree();
+ this->sampledSATree = new SampledSATree();
+#endif
+
+ this->pos = new Pos(sampleInterval);
+ this->handle = new Handle();
+
+ pos->handle=this->handle;
+ handle->pos=this->pos;
+
+}
+
+void DynFMI::append(uchar *text, ulong length){
+ ulong i;
+ ulong start = root->bittree->getPositions();
+ for (i = start; i<start+length; i++)
+ insert(text[i-start], i+1);
+}
+
+
+void DynFMI::append(uchar c){
+ insert(c, root->bittree->getPositions()+1);
+}
+
+void DynFMI::insert(uchar c, ulong i){
+ #ifndef NDEBUG
+ if (codelengths[c]==0) {
+ cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+
+ ulong level = 0;
+ ulong code = codes[c];
+
+ bool bit;
+ WaveletNode *walk = root;
+
+ while (walk) {
+
+ bit = ((code & (1u << level)) != 0);
+
+ walk->bittree->insertBit(bit,i); // TODO improve
+ i=walk->bittree->rank(bit, i);
+
+ if (bit) { //bit = 1
+ walk=walk->right;
+ } else { // bit = 0
+ walk=walk->left;
+ }
+
+ level++;
+ } // end of while
+
+ int j = 256+c;
+ while(j>1) {
+ C[j]++;
+ j=binaryTree_parent(j);
+ }
+ C[j]++;
+
+}
+
+void DynFMI::deleteSymbol(ulong i){
+ WaveletNode *walk = root;
+ bool bit;
+ uchar c;
+ while (true) {
+
+ // original slow version:
+ //bit = (*walk->bittree)[i];
+ //old_i = i;
+ //i=walk->bittree->rank(bit, i);
+ //walk->bittree->deleteBit(old_i);
+
+
+ walk->bittree->deleteBit(i);
+ bit=walk->bittree->getLastBitDeleted();
+ i=walk->bittree->getLastRank();
+
+ if (bit) { //bit = 1
+ if (walk->right == 0) {
+ c = walk->c1;
+ break;
+ }
+ walk=walk->right;
+ } else { // bit = 0
+ if (walk->left == 0) {
+ c = walk->c0;
+ break;
+ }
+ walk=walk->left;
+ }
+
+
+ } // end of while
+
+ int j = 256+c;
+ while(j>1) {
+ C[j]--;
+ j=binaryTree_parent(j);
+ }
+ C[j]--;
+
+
+}
+
+
+uchar DynFMI::operator[](ulong i){
+ WaveletNode *walk = root;
+ bool bit;
+
+ while (true) {
+
+ bit = (*walk->bittree)[i]; //TODO improve by reducing
+ i=walk->bittree->rank(bit, i);
+
+ if (bit) { //bit = 1
+ if (walk->right == 0) return walk->c1;
+ walk=walk->right;
+ } else { // bit = 0
+ if (walk->left == 0) return walk->c0;
+ walk=walk->left;
+ }
+
+
+ } // end of while
+ cout << endl;
+}
+
+ulong DynFMI::rank(uchar c, ulong i){
+ if (i==0) return 0;
+
+ ulong level = 0;
+ ulong code = codes[c];
+
+
+ bool bit;
+ WaveletNode *walk = root;
+
+ while (true) {
+
+ bit = ((code & (1u << level)) != 0);
+
+ i=walk->bittree->rank(bit, i);
+ if (bit) { //bit = 1
+ if (walk->right == 0) return i;
+ walk=walk->right;
+ } else { // bit = 0
+ if (walk->left == 0) return i;
+ walk=walk->left;
+ }
+
+ level++;
+ } // end of while
+
+ cerr << "error: DynFMI::rank: left while loop" << endl;
+ exit(EXIT_FAILURE);
+ return 0; //never
+}
+
+ulong DynFMI::select(uchar c, ulong i){
+
+ WaveletNode *walk = leaves[c];
+
+ bool bit = (walk->c1==c);
+
+ while (walk->parent) {
+ i=walk->bittree->select(bit, i);
+
+ bit = (walk == walk->parent->right);
+ walk=walk->parent;
+ } // end of while
+
+ i=walk->bittree->select(bit, i);
+
+ return i;
+}
+
+
+ulong DynFMI::count(uchar *pattern){
+
+ ulong i = strlen((char*)pattern);
+ ulong sp = 1;
+
+ ulong ep = getSize();
+ uchar s;
+ while ((sp <= ep) && (i >=1)) {
+ s=pattern[i-1];
+ sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
+ ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
+
+ i--;
+ }
+ return ep-sp +1;
+}
+
+#if SAMPLE!=0
+ulong* DynFMI::locate(uchar *pattern){
+
+ ulong i = strlen((char*)pattern);
+ ulong sp = 1;
+
+ ulong ep = getSize();
+ uchar s;
+ while ((sp <= ep) && (i >=1)) {
+ s=pattern[i-1];
+ sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
+ ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
+
+ i--;
+ }
+
+ ulong numberOfMatches = ep-sp +1;
+ ulong *matches = new ulong[2*numberOfMatches + 1];
+ matches[0] = numberOfMatches;
+
+ ulong diff;
+ ulong k = 1;
+ cout << "----------------------" << endl;
+ for (ulong j=sp; j <= ep; j++) {
+
+ i=j;
+ diff = 0;
+ while (!(*SampledSAPositionsIndicator)[i]) {
+ diff++;
+ s=(*this)[i];
+ i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i);
+
+ }
+
+
+ sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i));
+
+ //cout << "found: (" << sampledSATree->handle << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl;
+ matches[k++] = sampledSATree->handle;
+ matches[k++] = sampledSATree->sampledSAValue + diff;
+ }
+
+ return matches;
+}
+#endif
+
+// size must include endmarker!
+ulong DynFMI::addText(uchar const * str, ulong n){
+ ulong i;
+#if SAMPLE!=0
+ bool sample = false;
+#endif
+ i=pos->getSize()+1;
+
+ ulong key = pos->appendText(n);
+
+ // Adding an empty text (end-marker)
+ if (n == 1)
+ {
+ insert(str[0], i);
+ //if (del) delete [] str;
+ return key;
+ }
+
+ insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1
+#if SAMPLE!=0
+ sample = (n-1 % sampleInterval == 0);
+ SampledSAPositionsIndicator->insertBit(sample, i);
+ if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+ for (ulong t=n-2; t > 0; t--) {
+ i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i);
+ insert(str[t-1],i);
+#if SAMPLE!=0
+ sample = ((t) % sampleInterval == 0);
+ SampledSAPositionsIndicator->insertBit(sample, i);
+
+ if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+ }
+
+
+ i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i);
+ insert(str[n-1],i);
+#if SAMPLE!=0
+ SampledSAPositionsIndicator->insertBit(true, i);
+ sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+// if (del) delete [] str;
+
+ return key;
+}
+
+
+ulong DynFMI::addTextFromFile(char* filename, ulong n){
+ ulong i;
+#if SAMPLE!=0
+ bool sample = false;
+#endif
+ i=pos->getSize()+1;
+
+
+ std::ifstream str(filename, ios::binary);
+ if (!str)
+ {
+ cerr << "error reading file: " << filename << endl;
+ exit(EXIT_FAILURE);
+ }
+
+ char c;
+ //char oldc;
+
+
+ ulong buf_len; //1048576; //10MB
+ char * buffer;
+ ulong buf_pos;
+
+ if (BUFFER==0) buf_len = n;
+ else buf_len = BUFFER;
+
+ buffer = new char[buf_len];
+ buf_pos = (n-2) / buf_len;
+ str.seekg(buf_pos*buf_len);
+ str.read(buffer, buf_len);
+ str.clear(std::ios_base::goodbit);
+
+
+ ulong key = pos->appendText(n);
+
+ //str.seekg(n-2);
+
+ //c=str.get();
+ c=buffer[(n-2)%buf_len];
+ insert(c,i); // insert second last character, corresponds to suffix of length 1
+#if SAMPLE!=0
+ sample = (n-1 % sampleInterval == 0);
+ SampledSAPositionsIndicator->insertBit(sample, i);
+ if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+ if ((n-2)%buf_len == 0) {
+ c=buffer[0];
+ buf_pos--;
+ str.seekg(buf_pos*buf_len);
+ str.read(buffer, buf_len);
+ }
+ for (ulong t=n-2; t > 0; t--) {
+
+ i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i);
+
+ c=buffer[(t-1)%buf_len];
+ insert(c,i);
+
+ //check buffer
+ if (((t-1)%buf_len == 0) && ((t-1)!=0)) {
+#ifndef NDEBUG
+ if (buf_pos == 0) {
+ cerr << "buf_pos too small" << endl;
+ exit(0);
+ }
+#endif
+ buf_pos--;
+ str.seekg(buf_pos*buf_len);
+ str.read(buffer, buf_len);
+ }
+
+#if SAMPLE!=0
+ sample = ((t) % sampleInterval == 0);
+ SampledSAPositionsIndicator->insertBit(sample, i);
+
+ if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+ }
+
+ i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
+ insert(c,i);
+
+#if SAMPLE!=0
+ SampledSAPositionsIndicator->insertBit(true, i);
+ sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+
+ str.close();
+
+ return key;
+}
+
+
+uchar* DynFMI::retrieveText(ulong key){
+ cout << "key: " << key << endl;
+ uchar *text=0;
+ // TODO access two times, too much
+ ulong i=handle->getPos(key); //=bwtEnd;
+ if (i == 0 ) {
+ return text;
+ }
+
+ ulong n=pos->getTextSize(i);
+
+ text = new uchar[n]; // last byte 0 for cout
+
+ uchar c;
+
+//TODO better
+ for (ulong t=n-2; t < n; t--) {
+ c = (*this)[i];
+ text[t]=(c==0?'#':c);
+
+ i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
+ }
+
+ #ifndef NDEBUG
+ for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-';
+ #endif
+
+ text[n-1] = 0;
+ return text;
+}
+
+
+bool DynFMI::deleteText(ulong key){
+ // TODO access two times, can do better
+ ulong i=handle->getPos(key); //=bwtEnd;
+ if (i == 0) return false;
+
+ ulong n=pos->getTextSize(i);
+ ulong *text = new ulong[n];
+ uchar c;
+
+ for (ulong t=n-1; t <n; t--) {
+ c = (*this)[i];
+ text[t]=i;
+ i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); // TODO improve by lastrank!
+ }
+
+
+ std::sort(text, text + n);
+
+
+ for (i=n-1; i<n; i--) {
+ this->deleteSymbol(text[i]);
+#if SAMPLE!=0
+ SampledSAPositionsIndicator->deleteBit(text[i]);
+//TODO samples ?
+#endif
+ }
+ delete[] text;
+ handle->deleteKey(key);
+
+
+ return true;
+}
+
+ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){
+ int j = 256+c;
+ ulong r=0;
+ while(j>1) {
+ if (binaryTree_isRightChild(j))
+ r += C[binaryTree_left(binaryTree_parent(j))];
+
+ j=binaryTree_parent(j);
+ }
+ return r;
+}
+
+
+
+
+void DynFMI::printDynFMIContent(std::ostream& stream){
+ uchar c;
+ for (ulong i=1; i<=getSize(); i++)
+ {
+ c =(*this)[i];
+ if (c==0) c= '#';
+ stream << c;
+ }
+}
+
+
+
+
--- /dev/null
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+// ------ Dynamic Sequence with Indels -----
+// uses huffman shaped wavelet tree
+// space: O(nH(0)) operation time: O(log n log \sigma)
+// papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text
+// Indexes. CPM 2006, Chapter 3.6
+
+
+#include <iostream>
+#include <cstdlib>
+#include <fstream>
+#include <math.h>
+#include <bitset>
+
+#ifndef SAMPLE
+//#define SAMPLE 1024
+#define SAMPLE 0 // No sampling.
+#endif
+
+#ifndef BUFFER
+#define BUFFER 1048576
+#endif
+
+
+#include "bittree.h"
+#include "handle.h"
+#include "pos.h"
+
+#if SAMPLE!=0
+#include "ssatree.h"
+#endif
+
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+
+const ulong sampleInterval = SAMPLE;
+
+class WaveletNode{
+ public:
+
+ WaveletNode *left;
+ WaveletNode *right;
+ WaveletNode *parent;
+ ulong weight; // used only while construction
+ uchar c0; // used also while construction
+ uchar c1;
+
+ BVTree *bittree;
+
+ WaveletNode(uchar c)
+ : left(0), right(0), parent(0), weight(0), c0(c), bittree(0){}
+
+ WaveletNode(WaveletNode *left, WaveletNode *right)
+ : left(left), right(right), parent(0), bittree(0) {
+ weight = left->weight + right->weight;
+ left->parent = this;
+ right->parent = this;
+ }
+
+ ~WaveletNode(){
+ delete bittree;
+ bittree = 0;
+ }
+
+ bool operator>(const WaveletNode &a) const {
+ return (weight > a.weight);
+ }
+
+
+};
+
+namespace std
+{
+
+template<> struct greater<WaveletNode*>
+ {
+ bool operator()(WaveletNode const* p1, WaveletNode const* p2)
+ {
+ if(!p1)
+ return false;
+ if(!p2)
+ return true;
+ return *p1 > *p2;
+ }
+ };
+}
+
+
+class DynFMI{
+ public:
+
+ // read char distribution from string
+ // x=expected number of texts, n=length of text, del=delete text afterwards
+ DynFMI(uchar *text, ulong x, ulong n, bool del);
+ // read char distribution from file
+ DynFMI(char *readfile);
+
+ ~DynFMI();
+
+
+
+ //n=length of str, del=delete text afterwards
+ ulong addText(uchar const * str, ulong n);
+ ulong addTextFromFile(char* filename, ulong n);
+ uchar* retrieveText(ulong text);
+ bool deleteText(ulong key);
+
+ ulong count(uchar *pattern);
+#if SAMPLE!=0
+ ulong* locate(uchar *pattern);
+#endif
+
+ //LF(i)-mapping: C[L[i]]+rank_L[i](L,i)
+ ulong LFmapping(ulong i) {
+ uchar s=(*this)[i];
+ return (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,i);
+ }
+
+
+ ulong getSize() {return root->bittree->getPositions();}
+ ulong getCollectionSize(){ return pos->getSize();}
+ uchar* getBWT();
+ std::ostream& getBWTStream(std::ostream& stream);
+
+ uchar operator[](ulong i);
+ void printDynFMIContent(std::ostream& stream);
+ ulong* getKeys() {return handle->getKeys(); }
+ ulong getPos(ulong i)
+ { return handle->getPos(i); }
+
+ private:
+ WaveletNode *root;
+ WaveletNode **leaves; // needed for construction and select
+
+ ulong codes[256];
+ int codelengths[256];
+ ulong C[256+256];
+
+#if SAMPLE!=0
+ BVTree *SampledSAPositionsIndicator;
+#endif
+ ulong iterate;
+
+ Handle *handle;
+ Pos *pos;
+#if SAMPLE!=0
+ SampledSATree *sampledSATree;
+#endif
+
+
+ ulong rank(uchar c, ulong i);
+ ulong select(uchar c, ulong i);
+
+ void insert(uchar c, ulong i);
+ void append(uchar c);
+ void append(uchar *text, ulong length);
+ void deleteSymbol(ulong i);
+
+
+
+ // functions
+ ulong getNumberOfSymbolsSmallerThan(uchar c);
+
+ void initEmptyDynFMI(uchar *text, ulong x, ulong n);
+
+ void makeCodes(ulong code, int bits, WaveletNode *node);
+ void deleteLeaves(WaveletNode *node);
+ void appendBVTrees(WaveletNode *node);
+
+ void deleteDynFMINodes(WaveletNode *n);
+
+ //Iterator (public??)
+ void iterateReset();
+ bool iterateNext();
+ uchar iterateGetSymbol();
+ void recursiveIterateResetWaveletNode(WaveletNode *w);
+
+
+ // small help functions
+ double log2(double x){
+ return (log10(x) / log10((double)2));
+ }
+
+ int binaryTree_parent(int i){
+ return (int)floor((double)i/2);
+ }
+
+ int binaryTree_left(int i){
+ return 2*i;
+ }
+
+ int binaryTree_right(int i){
+ return 2*i + 1;
+ }
+
+ bool binaryTree_isLeftChild(int i){
+ return (i%2==(int)0);
+ }
+
+ bool binaryTree_isRightChild(int i){
+ return (i%2==(int)1);
+ }
+
+};
+
+
+
+
--- /dev/null
+
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * No object matches key 'wgerlach'. *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+#include <iostream>
+#include <cstdlib>
+
+#include "handle.h"
+
+using std::cerr;
+using std::cout;
+using std::endl;
+
+ulong Handle::getPos(ulong key){
+ HandleNode *n = getKey(key);
+ if (n==0) return 0;
+
+ PosNode *p = n->posNode;
+
+ #ifndef NDEBUG
+ if (p==pos->getNil()) {
+ cerr << "error: Handle::getPos: no PosNode found." << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ return (this->pos->getPos(p));
+}
+
+ulong* Handle::getEndPositions(){
+ ulong num = getSize();
+ ulong i = 0;
+
+ if (this->root == getNil()) return 0;
+
+ ulong *positions= new ulong[num];
+
+
+ HandleNode *n = (HandleNode*) treeMinimum(root);
+ while (n != getNil()) {
+
+ //printf("node: %d", n);
+ positions[i] = this->pos->getPos(n->posNode);
+ i++;
+ n = (HandleNode*) treeSuccessor((RBNode*) n);
+ }
+ return positions;
+}
+
+
+ulong* Handle::getKeys(){
+ ulong num = getSize();
+ ulong i = 0;
+
+ if (this->root == getNil()) return 0;
+
+ ulong *keys= new ulong[num];
+
+
+ HandleNode *n = (HandleNode*) treeMinimum(root);
+ #ifndef NDEBUG
+ HandleNode *oldn;
+ #endif
+ while (n != getNil()) {
+
+ //printf("node: %d", n);
+ keys[i] = n->key;
+ i++;
+ #ifndef NDEBUG
+ oldn = n;
+ #endif
+ n = (HandleNode*) treeSuccessor((RBNode*) n);
+ #ifndef NDEBUG
+ if (n == oldn) {
+ cerr << "old!" << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ }
+ #ifndef NDEBUG
+ if (num != i) {
+ cerr << "num: " << num << endl;
+ cerr << "i: " << i << endl;
+ cerr << "error: Handle::getKeys: key number was not correct." << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ return keys;
+}
+
+void Handle::deleteKey(ulong key){
+
+ HandleNode *n = getKey(key);
+ this->pos->deletePosNode(n->posNode);
+
+ rbDelete( n, callHandleUpdateCountersOnPathToRoot);
+ delete n;
+}
+
+
+
+
+HandleNode* Handle::getKey(ulong key){
+ HandleNode *n= getRoot();
+ while (n != getNil()) {
+ if (n->key == key) return n;
+ if (n->getLeft() != getNil()) {
+ if (key <= n->getLeft()->maxkey) n=n->getLeft();
+ else n=n->getRight();
+ }
+ else n=n->getRight();
+ }
+
+
+ return 0;
+}
+
+void Handle::updateCountersOnPathToRoot(HandleNode *n) {
+ while (n != getNil()) {
+ updateCounters(n);
+
+ n = n->getParent();
+ }
+}
+
+
+void Handle::updateCounters(HandleNode *n) {
+ #ifndef NDEBUG
+ if (n == getNil()) {
+ cerr << "error: Handle::updateCounters" << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ n->subtreeSize=1;
+
+
+ n->subtreeSize += n->getLeft()->subtreeSize;
+
+
+ if ( n->getRight() != getNil()) {
+ n->subtreeSize += n->getRight()->subtreeSize;
+ n->maxkey=n->getRight()->maxkey;
+ } else n->maxkey=n->key;
+
+}
+
+
+HandleNode* Handle::getNewKey(){
+ HandleNode *n= getRoot();
+
+ if (n == getNil()) {
+ //tree is empty
+ HandleNode *newLeaf= new HandleNode(getNil(),1); // 1=smallest key, 0 = error
+ setRoot(newLeaf);
+ ((RBNode*)newLeaf)->color=BLACK;
+
+ return newLeaf;
+ }
+
+ if (n->maxkey == n->subtreeSize) {
+ // tree is full
+ HandleNode *last = (HandleNode*) treeMaximum(n);
+
+
+ HandleNode *newLeaf= new HandleNode(getNil(), n->maxkey+1);
+
+ last->setRight(newLeaf);
+ newLeaf->setParent(last);
+
+ if (newLeaf->getParent() != getNil()) updateCountersOnPathToRoot(newLeaf->getParent());
+
+ rbInsertFixup(newLeaf, callHandleUpdateCounters);
+
+ return newLeaf;
+ }
+
+ HandleNode *newNode;
+ ulong smallerkeys = 0;
+ ulong lmax; //getLeft()->maxkey
+ ulong lsub; //n->getLeft()->subtreeSize
+ while (true) {
+ cout << "search first free key" << endl;
+ // search first free key
+
+ lmax = n->getLeft()->maxkey;
+ lsub = n->getLeft()->subtreeSize;
+
+
+ if (lmax == 0) { // no left child
+ if (smallerkeys+1 < n->key) { // check if it is free
+ newNode= new HandleNode(getNil(), smallerkeys+1);
+ newNode->setParent(n);
+ n->setLeft(newNode);
+ updateCountersOnPathToRoot(n);
+ rbInsertFixup(newNode, callHandleUpdateCounters);
+ return newNode;
+ }
+ } else { //left child exists
+ if ( lmax > (lsub + smallerkeys) ) {
+ // free key at left subtree
+ n=n->getLeft();
+ continue;
+ } else if (lmax + 1 < n->key) { // full left subtree, check if it is free inbetween
+ // insert to predecessor
+ n=(HandleNode*)treePredecessor(n);
+ //found place -> :predeccessor new key = lmax + 1
+ newNode= new HandleNode(getNil(), lmax+1);
+ newNode->setParent(n);
+ n->setRight(newNode);
+ updateCountersOnPathToRoot(n);
+ rbInsertFixup(newNode, callHandleUpdateCounters);
+ return newNode;
+
+ }
+ }
+
+ smallerkeys += 1+lsub;
+
+ if (n->getRight() == getNil()) { // no right child, must be free
+ newNode= new HandleNode(getNil(), smallerkeys+1);
+ newNode->setParent(n);
+ n->setRight(newNode);
+ updateCountersOnPathToRoot(n);
+ rbInsertFixup(newNode, callHandleUpdateCounters);
+ return newNode;
+ } else { //right child exists
+
+ //n=n->getRight();
+ if (n->getRight()->maxkey-smallerkeys == n->getRight()->subtreeSize) { // right subTree is full, insert in front
+ ulong leftMinKey = n->getRight()->maxkey - n->getRight()->subtreeSize;
+ if (leftMinKey -1 > n->key) { //in front there is space
+ //insert new n-key + 1
+ newNode= new HandleNode(getNil(), n->key+1);
+ n=(HandleNode*)treeSuccessor(n);
+ newNode->setParent(n);
+ n->setLeft(newNode);
+ updateCountersOnPathToRoot(n);
+ rbInsertFixup(newNode, callHandleUpdateCounters);
+ return newNode;
+ } else {
+ cerr << "error: Handle::getNewKey: no space ?!? " << endl;
+ exit(EXIT_FAILURE);
+ }
+ } else {
+ n=n->getRight();
+ }
+
+
+ }
+
+ #ifndef NDEBUG
+ if (n==getNil()) {
+ cerr << "error: Handle::getNewKey: (A) something wrong ! " << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ }
+ #ifndef NDEBUG
+ cerr << "error: Handle::getNewKey: (B) something wrong ! " << endl;
+ exit(EXIT_FAILURE);
+ #endif
+ return 0; // error
+}
+
+void callHandleUpdateCounters(RBNode *n, RBTree *T){
+ ((Handle *)T)->updateCounters((HandleNode*) n);
+}
+
+void callHandleUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){
+ ((Handle *)T)->updateCountersOnPathToRoot((HandleNode*) n);
+}
+
--- /dev/null
+
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * No object matches key 'wgerlach'. *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+
+#ifndef Handle
+#define Handle Handle
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+#include "rbtree.h"
+#include "pos.h"
+
+
+class Pos;
+class PosNode;
+
+void callHandleUpdateCounters(RBNode *n, RBTree *T);
+void callHandleUpdateCountersOnPathToRoot(RBNode *n, RBTree *T);
+
+
+class HandleNode : public RBNode {
+ public:
+ ulong key; //maximum for internal nodes
+ ulong subtreeSize;
+ ulong maxkey;
+ PosNode *posNode;
+
+ HandleNode()
+ : RBNode(this), key(0), subtreeSize(0), maxkey(0){
+ }
+
+
+ HandleNode(HandleNode *n, ulong key)
+ : RBNode(n), key(key), subtreeSize(1), maxkey(key){
+ }
+
+
+ HandleNode* getParent(){
+ return ((HandleNode*) ((RBNode*) this)->parent);
+ }
+
+ HandleNode* getLeft(){
+ return ((HandleNode*) ((RBNode*) this)->left);
+ }
+
+ HandleNode* getRight(){
+ return ((HandleNode*) ((RBNode*) this)->right);
+ }
+
+ void setParent(HandleNode* n){
+ ((RBNode*) this)->parent=(RBNode*)n;
+ }
+
+ void setLeft(HandleNode* n){
+ ((RBNode*) this)->left=(RBNode*)n;
+ }
+
+ void setRight(HandleNode* n){
+ ((RBNode*) this)->right=(RBNode*)n;
+ }
+
+};
+
+class Handle : public RBTree{
+ public:
+ Pos *pos;
+
+ Handle(){
+ setNil(new HandleNode());
+ setRoot(getNil());
+ }
+
+
+ void setRoot(HandleNode* n){
+ ((RBTree*) this)->root=(RBNode*)n;
+ }
+
+ HandleNode* getRoot(){
+ return ((HandleNode*) ((RBTree*) this)->root);
+ }
+
+ void setNil(HandleNode* n){
+ ((RBTree*) this)->nil=(RBNode*)n;
+ }
+
+ HandleNode* getNil(){
+ return ((HandleNode*) ((RBTree*) this)->nil);
+ }
+
+
+ ulong getSize(){
+ return (getRoot()!=getNil())?getRoot()->subtreeSize:0;
+ }
+
+ ulong* getKeys();
+
+ ulong getPos(ulong key);
+ ulong* getEndPositions();
+
+ HandleNode* getKey(ulong key);
+ void deleteKey(ulong key);
+ void updateCountersOnPathToRoot(HandleNode *n);
+ void updateCounters(HandleNode *n);
+
+ HandleNode* getNewKey();
+
+
+
+};
+
+#endif
--- /dev/null
+CC = g++
+CPPFLAGS = -Wall -ansi -pedantic -g
+
+testTextCollection_obs = testTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o handle.o pos.o rbtree.o dynFMI.o
+
+testTextCollection: $(testTextCollection_obs)
+ $(CC) -o testTextCollection $(testTextCollection_obs)
+
+clean:
+ rm -f core *.o *~ testTextCollection
+
+depend:
+ $(CC) -MM *.cpp *.c > dependencies.mk
+
+include dependencies.mk
--- /dev/null
+
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * No object matches key 'wgerlach'. *
+ * *
+ * This program is free software; you can redistribute it and/or modquity *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+#include <iostream>
+#include <cstdlib>
+
+#include "pos.h"
+
+using std::cerr;
+using std::endl;
+
+ulong Pos::getPos(PosNode *n){
+ ulong position=0;
+
+ if (n->getLeft() != getNil()) position += n->getLeft()->subtreeSize;
+
+ while (n->getParent() != getNil()) {
+ if (n == n->getParent()->getRight()) {
+ // rightchild
+ if (n->getParent()->getLeft() != getNil()) position += n->getParent()->getLeft()->subtreeSize;
+ position++;
+ }
+
+ n=n->getParent();
+ }
+
+
+ return position+1;
+}
+
+ulong Pos::getTextSize(ulong pos){
+ PosNode *n= getPosNode(pos);
+
+ return n->textSize;
+}
+
+void Pos::deleteText(ulong pos){
+ PosNode *n= getPosNode(pos);
+
+ // current node matches.
+ rbDelete(n, callPosUpdateCountersOnPathToRoot);
+ delete n;
+}
+
+void Pos::deletePosNode(PosNode *n){
+ rbDelete(n, callPosUpdateCountersOnPathToRoot);
+ delete n;
+}
+
+PosNode* Pos::getPosNode(ulong pos){
+ // smallest pos is 1 !
+ pos--;
+ PosNode *n= getRoot();
+ ulong leftTree=0;
+ ulong leftSubTreeSize;
+ while (true) {
+ leftSubTreeSize = n->getLeft()->subtreeSize;
+
+ if (pos == leftTree + leftSubTreeSize ) {
+ // current node matches.
+ return n;
+ } else if (pos < leftTree + leftSubTreeSize){
+ n=n->getLeft();
+ }
+ else {
+ leftTree += leftSubTreeSize +1;
+ n=n->getRight();
+ }
+ }
+
+
+ cerr << "error: Pos::getPosNode: text POS " << pos << " not found!" << endl;
+ exit(EXIT_FAILURE);
+
+ return getNil();
+}
+
+
+ulong Pos::appendText(ulong textSize){
+ PosNode *n= getRoot();
+
+ if (n == getNil()) {
+ PosNode *newLeaf= new PosNode(getNil(), textSize);
+
+ setRoot(newLeaf);
+ ((RBNode*)newLeaf)->color = BLACK;
+
+ HandleNode* hn = handle->getNewKey();
+
+ // connect handleNode and posNode:
+ hn->posNode = newLeaf;
+ newLeaf->handleNode = hn;
+
+ return hn->key;
+ }
+
+ while (n->getRight() != getNil()) {
+ n=n->getRight();
+ }
+
+ // new right child !
+ PosNode *newLeaf= new PosNode(getNil(), textSize);
+
+
+ n->setRight(newLeaf);
+ newLeaf->setParent(n);
+
+ updateCountersOnPathToRoot(newLeaf);
+
+ rbInsertFixup(newLeaf, callPosUpdateCounters);
+
+ HandleNode* hn = handle->getNewKey();
+
+ // connect handleNode and posNode:
+ hn->posNode = newLeaf;
+ newLeaf->handleNode = hn;
+
+ return hn->key;
+}
+
+
+
+
+void Pos::updateCountersOnPathToRoot(PosNode *n) {
+ while (n != getNil()) {
+ updateCounters(n);
+
+ n = n->getParent();
+ }
+}
+
+void Pos::updateCounters(PosNode *n) {
+ n->subtreeSize=1;
+ //n->sumTextSize = n->textSize;
+
+ n->subtreeSize += n->getLeft()->subtreeSize;
+ //n->sumTextSize += n->getLeft()->sumTextSize;
+
+ n->subtreeSize += n->getRight()->subtreeSize;
+ //n->sumTextSize += n->getRight()->sumTextSize;
+}
+
+
+
+void callPosUpdateCounters(RBNode *n, RBTree *T){
+ ((Pos *)T)->updateCounters((PosNode*) n);
+}
+
+void callPosUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){
+ ((Pos *)T)->updateCountersOnPathToRoot((PosNode*) n);
+}
+
--- /dev/null
+
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * No object matches key 'wgerlach'. *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+
+ //TODO check star color of the nodes pos and handle!
+
+#ifndef Pos
+#define Pos Pos
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+#include "rbtree.h"
+#include "handle.h"
+
+class Handle;
+class HandleNode;
+
+void callPosUpdateCounters(RBNode *n, RBTree *T);
+void callPosUpdateCountersOnPathToRoot(RBNode *n, RBTree *T);
+
+class PosNode : public RBNode {
+ public:
+
+ HandleNode *handleNode;
+ ulong subtreeSize;
+ ulong textSize; // size including endmarker!
+ //ulong sumTextSize; // sum of textlength of all texts located in this subtree;
+
+ PosNode() // NIL
+ : RBNode(this),subtreeSize(0),textSize(0) {
+ }
+
+ PosNode(PosNode *n, ulong textSize)
+ : RBNode(n),subtreeSize(1),textSize(textSize) {
+ }
+
+
+ PosNode* getParent(){
+ return ((PosNode*) ((RBNode*) this)->parent);
+ }
+
+ PosNode* getLeft(){
+ return ((PosNode*) ((RBNode*) this)->left);
+ }
+
+ PosNode* getRight(){
+ return ((PosNode*) ((RBNode*) this)->right);
+ }
+
+ void setParent(PosNode* n){
+ ((RBNode*) this)->parent=(RBNode*)n;
+ }
+
+ void setLeft(PosNode* n){
+ ((RBNode*) this)->left=(RBNode*)n;
+ }
+
+ void setRight(PosNode* n){
+ ((RBNode*) this)->right=(RBNode*)n;
+ }
+};
+
+class Pos: public RBTree
+{
+ public:
+
+ Handle* handle;
+
+ ulong textNumber;
+ ulong matchPosition;
+
+ ulong sampleInterval;
+
+ Pos(ulong sampleInterval){
+ setNil(new PosNode());
+ setRoot(getNil());
+ this->sampleInterval=sampleInterval;
+ }
+
+ void setRoot(PosNode* n){
+ ((RBTree*) this)->root=(RBNode*)n;
+ }
+
+ PosNode* getRoot(){
+ return ((PosNode*) ((RBTree*) this)->root);
+ }
+
+ void setNil(PosNode* n){
+ ((RBTree*) this)->nil=(RBNode*)n;
+ }
+
+ PosNode* getNil(){
+ return ((PosNode*) ((RBTree*) this)->nil);
+ }
+
+ ulong getSize(){
+ return (getRoot()!=getNil())?getRoot()->subtreeSize:0;
+ }
+
+
+
+ ulong getPos(PosNode *n);
+
+
+ PosNode* getPosNode(ulong text);
+ ulong getTextSize(ulong pos);
+ void deleteText(ulong pos);
+ void deletePosNode(PosNode *n);
+ ulong appendText(ulong textSize);
+
+ void updateCountersOnPathToRoot(PosNode *n);
+ void updateCounters(PosNode *n);
+
+};
+
+#endif
--- /dev/null
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * No object matches key 'wgerlach'. *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+
+#include <iostream>
+#include <cstdlib>
+#include "rbtree.h"
+
+
+using std::cout;
+using std::endl;
+using std::cerr;
+
+
+void RBTree::checkTree(){
+ cout << "start" << endl;
+ this->root->countBlack(0);
+ cout << "stop" << endl;
+}
+
+void RBTree::leftRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)){
+ RBNode* y = x->right;
+ x->right=y->left;
+ if (y->left != nil) y->left->parent=x;
+ y->parent=x->parent;
+ if (x->parent==this->nil)
+ root=y;
+ else {
+ if (x == x->parent->left)
+ x->parent->left=y;
+ else
+ x->parent->right=y;
+ }
+ y->left=x;
+ x->parent=y;
+
+ // update counters of x and y !
+ if (updateNode != 0) {
+ updateNode(x, this);
+ updateNode(y, this);
+ }
+}
+
+void RBTree::rightRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)){
+ RBNode* y = x->left;
+ x->left=y->right;
+ if (y->right != nil) y->right->parent=x;
+ y->parent=x->parent;
+
+
+ if (x->parent==nil)
+ root=y;
+ else {
+ if (x == x->parent->right) {
+ x->parent->right=y;
+ } else {
+ x->parent->left=y;
+ }
+ }
+
+ y->right=x;
+ x->parent=y;
+ if (updateNode != 0) {
+ updateNode(x, this);
+ updateNode(y, this);
+ }
+}
+
+void RBTree::rbInsertFixup(RBNode* z, void (*updateNode)(RBNode* n, RBTree *T)){
+ RBNode *y;
+
+ if (z->parent==nil) return;
+ while (z->parent->color == RED) {
+ if (z->parent == z->parent->parent->left)
+ {
+ y = z->parent->parent->right;
+
+ if (y->color==RED)
+ {
+ z->parent->color=BLACK; // Case 1a
+ y->color=BLACK; // Case 1a
+ z->parent->parent->color=RED; // Case 1a
+ z=z->parent->parent; // Case 1a
+ }
+ else
+ {
+ if (z==z->parent->right)
+ {
+ z=z->parent; // Case 2a
+ leftRotate(z, updateNode); // Case 2a
+ }
+ z->parent->color=BLACK; // Case 3a
+ z->parent->parent->color=RED; // Case 3a
+ rightRotate(z->parent->parent, updateNode); // Case 3a
+ }
+ }
+ else
+ {
+
+ y = z->parent->parent->left;
+
+ if (y->color==RED)
+ {
+ z->parent->color=BLACK; // Case 1b
+ y->color=BLACK; // Case 1b
+ z->parent->parent->color=RED; // Case 1b
+ z=z->parent->parent; // Case 1b
+ }
+ else
+ {
+ if (z==z->parent->left)
+ {
+ z=z->parent; // Case 2b
+ rightRotate(z, updateNode); // Case 2b
+ }
+ z->parent->color=BLACK; // Case 3b
+ z->parent->parent->color=RED; // Case 3b
+ leftRotate(z->parent->parent, updateNode); // Case 3b
+ }
+ }
+ if (z->parent==nil) return;
+ } //end of while
+
+ root->color=BLACK;
+}
+
+
+void RBTree::rbDeleteFixup(RBNode *x, void (*updateNode)(RBNode* n, RBTree *T)){
+ RBNode *w;
+
+ while ((x != root) && (x->color == BLACK))
+ {
+ if (x == x->parent->left)
+ {
+ w=x->parent->right;
+
+ if (w->color == RED)
+ {
+ w->color=BLACK; // Case 1a
+ x->parent->color=RED; // Case 1a
+ leftRotate(x->parent, updateNode); // Case 1a
+ w=x->parent->right; // Case 1a
+ }
+ if ((w->left->color == BLACK) && (w->right->color == BLACK))
+ {
+ w->color=RED; // Case 2a
+ x=x->parent; // Case 2a
+ }
+ else
+ {
+ if (w->right->color == BLACK)
+ {
+ w->left->color=BLACK; // Case 3a
+ w->color=RED; // Case 3a
+ rightRotate(w, updateNode); // Case 3a
+ w=x->parent->right; // Case 3a
+ }
+ w->color=x->parent->color; // Case 4a
+ x->parent->color=BLACK; // Case 4a
+ w->right->color=BLACK; // Case 4a
+ leftRotate(x->parent, updateNode); // Case 4a
+ x=root; // Case 4a
+ }
+ }
+ else
+ {
+ w=x->parent->left;
+
+ if (w->color == RED)
+ {
+ w->color=BLACK; // Case 1b
+ x->parent->color=RED; // Case 1b
+ rightRotate(x->parent, updateNode); // Case 1b
+ w=x->parent->left; // Case 1b
+ }
+ if ((w->right->color == BLACK) && (w->left->color == BLACK))
+ {
+ w->color=RED; // Case 2b
+ x=x->parent; // Case 2b
+ }
+ else
+ {
+ if (w->left->color == BLACK)
+ {
+ w->right->color=BLACK; // Case 3b
+ w->color=RED; // Case 3b
+ leftRotate(w, updateNode); // Case 3b
+ w=x->parent->left; // Case 3b
+ }
+
+ w->color=x->parent->color; // Case 4b
+ x->parent->color=BLACK; // Case 4b
+ w->left->color=BLACK; // Case 4b
+ rightRotate(x->parent, updateNode); // Case 4b
+
+ x=root; // Case 4b
+ }
+ }
+ } // end of while
+ x->color=BLACK;
+}
+
+// quite similar to Cormen et al
+void RBTree::rbDelete(RBNode *z, void (*updateNode)(RBNode* n, RBTree *T)){
+ RBNode *y,*x,*y_oldParent;
+ y_oldParent=nil;
+ if (z->left == nil || z->right == nil) {
+ y = z; //z has no or one child, deletion is easy
+ } else {
+ y = treeSuccessor(z);
+ y_oldParent=y->parent;
+ }
+
+ if (y->left != nil) {
+ x=y->left;
+ } else {
+ x=y->right;
+ }
+
+ x->parent=y->parent;
+
+ // cut out y:
+ if (y->parent == nil) {
+ root=x;
+ } else {
+ if (isLeftChild(y)) {y->parent->left = x;}
+ else {y->parent->right = x;}
+ }
+
+ RBNodecolor old_y = y->color;
+ if (y != z) { // 2 children
+ //move y to z's old position and delete z
+ if (root==z) {
+ root=y;
+ } else {
+ if (isLeftChild(z)) z->parent->left = y;
+ else z->parent->right = y;
+ }
+
+ y->parent = z->parent;
+ y->left = z->left;
+ y->right = z->right;
+ y->color = z->color; // don't forget to delete z after rbDelete
+
+ y->left->parent = y;
+ y->right->parent = y;
+
+ }
+
+
+ if (old_y == BLACK) {
+ rbDeleteFixup(x, updateNode);
+ }
+
+ updateNode(y, this);
+ if (y_oldParent!=nil) updateNode(y_oldParent, this);
+
+}
+
+
+RBNode* RBTree::treeSuccessor(RBNode *x){
+ if (x->right != nil) return treeMinimum(x->right);
+
+ RBNode *y = x->parent;
+ while ((y!=nil) && (x == y->right)) {
+ x=y;
+ y=y->parent;
+ }
+ return y;
+}
+
+RBNode* RBTree::treePredecessor(RBNode *x){
+ if (x->left != nil) return treeMaximum(x->left);
+
+ RBNode *y = x->parent;
+ while ((y!=nil) && (x == y->left)) {
+ x=y;
+ y=y->parent;
+ }
+ return y;
+}
+
+
+RBNode* RBTree::treeMinimum(RBNode *x){
+ while (x->left != nil) x=x->left;
+ return x;
+}
+
+RBNode* RBTree::treeMaximum(RBNode *x){
+ while (x->right != nil) x=x->right;
+ return x;
+}
+
+
+bool RBTree::isLeftChild(RBNode *n){
+ #ifndef NDEBUG
+ if (n->parent == nil) {
+ cerr << "error: isLeftChild, no parent." << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ return (n->parent->left == n);
+}
+
+bool RBTree::isRightChild(RBNode *n){
+ #ifndef NDEBUG
+ if ( n->parent == nil) {
+ cerr << "error: isLeftChild, no parent." << endl;
+ exit(EXIT_FAILURE);
+ }
+ #endif
+ return (n->parent->right == n);
+}
+
+
+RBNode* RBTree::findRightSiblingLeaf(RBNode *n){
+ // go up:
+ while (true) {
+ if (n->parent!=nil) {
+ if (n==n->parent->right)
+ n=n->parent;
+ else
+ break;
+ } else
+ return nil;
+ }
+
+ n=n->parent;
+
+ // leftmost leaf in n is the right sibling searched
+ n=n->right;
+
+ // go down:
+ while (n->left != nil) {
+ n=n->left;
+ }
+
+ return n;
+}
+
+RBNode* RBTree::findLeftSiblingLeaf(RBNode *n){
+ // go up:
+ while (true) {
+ if (n->parent!=nil) {
+ if (n==n->parent->left)
+ n=n->parent;
+ else
+ break;
+ } else
+ return nil;
+ }
+
+ n=n->parent;
+
+ // rightmost leaf in n is the left sibling searched
+ n=n->left;
+
+ // go down:
+ while (n->right != nil) {
+ n=n->right;
+ }
+
+ return n;
+}
+
+
+int RBTree::getNodeMaxDepth(RBNode *n) {
+ int l;
+ int r;
+ if (n->left == nil) l=0;
+ else l=getNodeMaxDepth(n->left);
+ if (n->right == nil) r=0;
+ else r=getNodeMaxDepth(n->right);
+
+ return (1+(l>r?l:r));
+}
+
+int RBTree::getNodeMinDepth(RBNode *n) {
+ int l;
+ int r;
+ if (n->left == 0) l=0;
+ else l=getNodeMinDepth(n->left);
+ if (n->right == 0) r=0;
+ else r=getNodeMinDepth(n->right);
+
+ return (1+(l>r?r:l));
+}
+
+
+void RBTree::printSubTree(RBNode *n){
+ cout << ((n->color==BLACK)?"B":"R") << n << " [";
+ if (n->left==nil) cout << "N";
+ else printSubTree(n->left);
+ cout << ",";
+ if (n->right==nil) cout << "N";
+ else printSubTree(n->right);
+ cout << "]";
+}
+
+
+void RBTree::checkSubTree(RBNode *n){
+
+ if (n->left!=nil) {
+ if (n->left->parent != n) {
+ cout << "au"<< endl;
+ exit(1);
+ }
+ checkSubTree(n->left);
+ }
+
+ if (n->right!=nil) {
+ if (n->right->parent != n) {
+ cout << "au"<< endl;
+ exit(1);
+ }
+ checkSubTree(n->right);
+ }
+
+}
+
+
+void RBTree::checkNode(RBNode *n){
+ if (n->left!=nil) {
+ if (n->left->parent != n) {
+ cout << "au"<< endl;
+ exit(1);
+ }
+
+ }
+
+ if (n->right!=nil) {
+ if (n->right->parent != n) {
+ cout << "au"<< endl;
+ exit(1);
+ }
+ }
+}
--- /dev/null
+/***************************************************************************
+ * Copyright (C) 2006 by Wolfgang Gerlach *
+ * No object matches key 'wgerlach'. *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU General Public License as published by *
+ * the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
+ ***************************************************************************/
+
+
+// this is a red-black tree implementation by wolfgang Gerlach based on the algorithm provided by
+// Cormen et al.: Introduction to Algorithms, Second Edition. MIT Press and McGraw-Hill, 2001
+
+
+
+#ifndef RBTree
+#define RBTree RBTree
+
+
+
+
+enum RBNodecolor{BLACK,RED};
+
+
+// generic Red-Black Tree Node:
+class RBNode
+{
+ public:
+ RBNode* parent;
+ RBNode* left;
+ RBNode* right; // 3*4 bytes
+
+ enum RBNodecolor color; // due to structure alignment: 4 bytes !!!
+
+ RBNode(){};
+
+ RBNode(RBNode *n)
+ : parent(n), left(n), right(n){
+ color=RED;
+ }
+
+
+ virtual ~RBNode(){} //adds 4 bytes vtable
+
+ void countBlack(int i){
+ if (this->color == BLACK) i++;
+ if (this->left != this) this->left->countBlack(i);
+ else std::cout << i << ",";
+ if (this->right != this) this->right->countBlack(i);
+ else std::cout << i << ",";
+ }
+};
+
+class RBTree{
+ public:
+
+ RBNode *root;
+ RBNode *nil;
+
+
+ virtual ~RBTree(){
+ // Don't double free!
+ if (root != this->nil)
+ deleteNode(root);
+ root = 0;
+ delete this->nil;
+ this->nil = 0;
+ }
+
+ void checkTree();
+
+ void rbInsertFixup(RBNode* z, void (*updateNode)(RBNode* n, RBTree *T));
+ void rbDeleteFixup(RBNode *x, void (*updateNode)(RBNode* n, RBTree *T));
+ void rbDelete(RBNode *z, void (*updateNode)(RBNode* n, RBTree *T));
+ RBNode* findRightSiblingLeaf(RBNode *n);
+ RBNode* findLeftSiblingLeaf(RBNode *n);
+ RBNode* treeSuccessor(RBNode *x);
+ RBNode* treePredecessor(RBNode *x);
+ RBNode* treeMinimum(RBNode *x);
+ RBNode* treeMaximum(RBNode *x);
+
+ bool isLeftChild(RBNode *n);
+ bool isRightChild(RBNode *n);
+
+ int getNodeMaxDepth(RBNode *n);
+ int getNodeMinDepth(RBNode *n);
+
+ void printSubTree(RBNode *n);
+ void checkSubTree(RBNode *n);
+ void checkNode(RBNode *x);
+
+ void deleteNode(RBNode* x){
+ if (x->left!=nil && x->left) deleteNode(x->left);
+ if (x->right!=nil && x->right) deleteNode(x->right);
+ delete x;
+ }
+
+
+ private:
+ void leftRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T));
+ void rightRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T));
+
+};
+
+#endif
--- /dev/null
+// Test driver for text collection
+#include <iostream>
+using std::cout;
+using std::endl;
+
+#include "TextCollection.h"
+using SXSI::TextCollection;
+
+void printDocumentResult(TextCollection::document_result dr)
+{
+ TextCollection::document_result::iterator it;
+ printf("document result:");
+ for (it = dr.begin(); it != dr.end(); ++it)
+ printf(" %i", *it);
+ printf("\n");
+ fflush(stdout);
+}
+
+void printFullResult(TextCollection::full_result fr)
+{
+ TextCollection::full_result::iterator it;
+ printf("full result:");
+ for (it = fr.begin(); it != fr.end(); ++it)
+ printf(" (%i, %lu)", (*it).first, (*it).second);
+ printf("\n");
+ fflush(stdout);
+}
+
+int main()
+{
+ uchar *text = (uchar*) "acabab";
+ TextCollection *csa = TextCollection::InitTextCollection(1);
+ csa->InsertText(text);
+ text = (uchar*) "abaca";
+ csa->InsertText(text);
+ text = (uchar*) "abacb";
+ csa->InsertText(text);
+
+ csa->MakeStatic();
+
+ text = csa->GetText(0);
+ cout << "Text 0: \"" << text << "\"" << endl;
+ delete [] text;
+ text = csa->GetText(1);
+ cout << "Text 1: \"" << text << "\"" << endl;
+ delete [] text;
+ text = csa->GetText(2);
+ cout << "Text 2: \"" << text << "\"" << endl;
+ delete [] text;
+
+ text = csa->GetText(2, 2, 4);
+ cout << "Substring of Text 3: \"" << text << "\"" << endl;
+ delete [] text;
+
+ printf("n:o contains: %u\n", csa->CountContains((uchar *)"ac"));
+ printf("n:o suffix: %u\n", csa->CountSuffix((uchar *)"b"));
+ printf("n:o equal: %u\n", csa->CountEqual((uchar *)"acabab"));
+ printf("is equal: %u\n", csa->IsEqual((uchar *)"abacb"));
+
+ TextCollection::document_result dr;
+ dr = csa->Contains((uchar*)"ab");
+ printDocumentResult(dr);
+
+ TextCollection::full_result fr;
+ fr = csa->FullContains((uchar *)"ab");
+ printFullResult(fr);
+
+ delete csa;
+}