Initial Import
authorkim <kim@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Mon, 24 Nov 2008 23:32:08 +0000 (23:32 +0000)
committerkim <kim@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Mon, 24 Nov 2008 23:32:08 +0000 (23:32 +0000)
git-svn-id: svn+ssh://idea.nguyen.vg/svn/sxsi/trunk/TextCollection@6 3cdefd35-fc62-479d-8e8d-bae585ffb9ca

21 files changed:
BitRank.cpp [new file with mode: 0644]
BitRank.h [new file with mode: 0644]
CSA.cpp [new file with mode: 0644]
CSA.h [new file with mode: 0644]
TextCollection.cpp [new file with mode: 0644]
TextCollection.h [new file with mode: 0644]
Tools.cpp [new file with mode: 0644]
Tools.h [new file with mode: 0644]
bittree.cpp [new file with mode: 0644]
bittree.h [new file with mode: 0644]
dependencies.mk [new file with mode: 0644]
dynFMI.cpp [new file with mode: 0644]
dynFMI.h [new file with mode: 0644]
handle.cpp [new file with mode: 0644]
handle.h [new file with mode: 0644]
makefile [new file with mode: 0644]
pos.cpp [new file with mode: 0644]
pos.h [new file with mode: 0644]
rbtree.cpp [new file with mode: 0644]
rbtree.h [new file with mode: 0644]
testTextCollection.cpp [new file with mode: 0644]

diff --git a/BitRank.cpp b/BitRank.cpp
new file mode 100644 (file)
index 0000000..61dcc42
--- /dev/null
@@ -0,0 +1,300 @@
+#include "BitRank.h"
+
+/*****************************************************************************
+ * Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved.                *
+ *                                                                           *
+ * New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations.            *
+ *                                                                           *
+ * This library is free software; you can redistribute it and/or             *
+ * modify it under the terms of the GNU Lesser General Public                *
+ * License as published by the Free Software Foundation; either              *
+ * version 2.1 of the License, or (at your option) any later version.        *
+ *                                                                           *
+ * This library is distributed in the hope that it will be useful,           * 
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of            *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU         *
+ * Lesser General Public License for more details.                           *
+ *                                                                           *
+ * You should have received a copy of the GNU Lesser General Public          *
+ * License along with this library; if not, write to the Free Software       *
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA *
+ ****************************************************************************/
+
+// Modified by Niko Välimäki
+/////////////
+//Rank(B,i)// 
+/////////////
+//This Class use a superblock size of 256-512 bits
+//and a block size of 32-64 bits also
+
+
+const unsigned char __popcount_tab[] =
+{
+0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+1,2,2,3,2,3,3,4,2,3,3,4,3,4,4,5,2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+2,3,3,4,3,4,4,5,3,4,4,5,4,5,5,6,3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,
+3,4,4,5,4,5,5,6,4,5,5,6,5,6,6,7,4,5,5,6,5,6,6,7,5,6,6,7,6,7,7,8,
+};
+
+const unsigned char select_tab[] =
+{
+0,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+8,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+7,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+
+6,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,5,1,2,1,3,1,2,1,4,1,2,1,3,1,2,1,
+};
+
+
+// bits needed to represent a number between 0 and n
+inline ulong bits (ulong n){
+       ulong b = 0;
+       while (n) { b++; n >>= 1; }
+       return b;
+}
+
+#if W == 32
+    // 32 bit version
+    inline unsigned popcount (register ulong x){
+        return __popcount_tab[(x >>  0) & 0xff]  + __popcount_tab[(x >>  8) & 0xff]  + __popcount_tab[(x >> 16) & 0xff] + __popcount_tab[(x >> 24) & 0xff];
+    }
+#else
+    // 64 bit version
+    inline unsigned popcount (register ulong x){
+        return __popcount_tab[(x >>  0) & 0xff]  + __popcount_tab[(x >>  8) & 0xff]  + __popcount_tab[(x >> 16) & 0xff] + __popcount_tab[(x >> 24) & 0xff] + __popcount_tab[(x >> 32) & 0xff] + __popcount_tab[(x >> 40) & 0xff] + __popcount_tab[(x >> 48) & 0xff] + __popcount_tab[(x >> 56) & 0xff];
+    }
+#endif
+
+inline unsigned popcount16 (register int x){
+  return __popcount_tab[x & 0xff]  + __popcount_tab[(x >>  8) & 0xff];
+}
+
+inline unsigned popcount8 (register int x){
+  return __popcount_tab[x & 0xff];
+}
+
+BitRank::BitRank(ulong *bitarray, ulong n, bool owner) {
+    data=bitarray;
+    this->owner = owner;
+    this->n=n;  // length of bitarray in bits
+    b = W; // b is a word
+    s=b*superFactor;
+    ulong aux=(n+1)%W;
+    if (aux != 0)
+        integers = (n+1)/W+1;
+    else 
+        integers = (n+1)/W;
+    BuildRank();
+}
+
+BitRank::~BitRank() {
+    delete [] Rs;
+    delete [] Rb;
+    if (owner) delete [] data;
+}
+
+//Build the rank (blocks and superblocks)
+void BitRank::BuildRank()
+{
+    ulong num_sblock = n/s;
+    ulong num_block = n/b;
+    Rs = new ulong[num_sblock+1];//+1 we add the 0 pos
+    Rb = new uchar[num_block+1];//+1 we add the 0 pos
+       
+       ulong j;
+    Rs[0] = 0lu;
+       
+    for (j=1;j<=num_sblock;j++) 
+        {
+                       Rs[j]=BuildRankSub((j-1)*superFactor,superFactor)+Rs[j-1];
+               }
+    
+    Rb[0]=0;
+    for (ulong k=1;k<=num_block;k++) {
+        j = k / superFactor;
+        Rb[k]=BuildRankSub(j*superFactor, k%superFactor);
+         }
+}
+
+ulong BitRank::BuildRankSub(ulong ini, ulong bloques){
+    ulong rank=0,aux;
+    
+       
+       for(ulong i=ini;i<ini+bloques;i++) {
+               if (i < integers) {
+                       aux=data[i];
+                       rank+=popcount(aux);
+               }
+       }
+     return rank; //return the numbers of 1's in the interval
+}
+
+
+//this rank ask from 0 to n-1
+ulong BitRank::rank(ulong i) {
+    ++i; // the following gives sum of 1s before i 
+    return Rs[i>>8]+Rb[i>>wordShift]
+        +popcount(data[i >> wordShift] & ((1lu << (i & Wminusone))-1));
+}
+
+ulong BitRank::select(ulong x) {
+    // returns i such that x=rank(i) && rank(i-1)<x or n if that i not exist
+    // first binary search over first level rank structure
+    // then sequential search using popcount over a int
+    // then sequential search using popcount over a char
+    // then sequential search bit a bit
+    
+    //binary search over first level rank structure
+    if (x == 0)
+        return 0;
+        
+    ulong l=0, r=n/s;
+    ulong mid=(l+r)/2;      
+    ulong rankmid = Rs[mid];
+    while (l<=r) {
+        if (rankmid<x)
+            l = mid+1;
+        else
+            r = mid-1;
+        mid = (l+r)/2;              
+        rankmid = Rs[mid];
+    }    
+    //sequential search using popcount over a int
+    ulong left;
+    left=mid*superFactor;
+    x-=rankmid;
+    ulong j = data[left];
+    
+    unsigned ones = popcount(j);
+    while (ones < x) {
+        x-=ones;left++;
+        if (left > integers) 
+            return n;
+            
+        j = data[left];
+        
+        ones = popcount(j);
+    }
+    //sequential search using popcount over a char
+    left=left*b; 
+    rankmid = popcount8(j);
+    if (rankmid < x) {
+        j=j>>8;
+        x-=rankmid;
+        left+=8;
+        rankmid = popcount8(j);
+        if (rankmid < x) {
+            j=j>>8;
+            x-=rankmid;
+            left+=8;
+            rankmid = popcount8(j);
+            if (rankmid < x) {
+                j=j>>8;
+                x-=rankmid;
+                left+=8;
+            }
+        }
+    }
+
+    // then sequential search bit a bit
+    while (x>0) {
+        if  (j&1lu) x--;
+        j=j>>1;
+        left++;
+    }
+    return left-1;
+}
+
+ulong BitRank::select0(ulong x) {
+    // returns i such that x=rank0(i) && rank0(i-1)<x or n if that i not exist
+    // first binary search over first level rank structure
+    // then sequential search using popcount over a int
+    // then sequential search using popcount over a char
+    // then sequential search bit a bit
+    
+    //binary search over first level rank structure
+    if (x == 0)
+        return 0;
+        
+    ulong l=0, r=n/s;
+    ulong mid=(l+r)/2;
+    ulong rankmid = mid * s - Rs[mid];
+    while (l<=r) {
+        if (rankmid<x)
+            l = mid+1;
+        else
+            r = mid-1;
+        mid = (l+r)/2;              
+        rankmid = mid * s - Rs[mid];
+    }    
+    
+    //sequential search using popcount over a int
+    ulong left;
+    left=mid*superFactor;
+    x-=rankmid;
+    ulong j = data[left];
+    
+    unsigned zeros = W - popcount(j);
+    while (zeros < x) {
+        x-=zeros;
+        left++;
+        if (left > integers) 
+            return n;
+            
+        j = data[left];
+        zeros = W - popcount(j);
+    }
+    
+    //sequential search using popcount over a char
+    left=left*b;
+    rankmid = 8 - popcount8(j);
+    if (rankmid < x) {
+        j=j>>8;
+        x-=rankmid;
+        left+=8;
+        rankmid = 8 - popcount8(j);
+        if (rankmid < x) {
+            j=j>>8;
+            x-=rankmid;
+            left+=8;
+            rankmid = 8 - popcount8(j);
+            if (rankmid < x) {
+                j=j>>8;
+                x-=rankmid;
+                left+=8;
+            }
+        }
+    }
+
+    // then sequential search bit a bit
+    while (x>0) {
+        if  (!(j&1lu)) x--;
+        j=j>>1;
+        left++;
+    }
+    return left - 1;
+}
+
+
+bool BitRank::IsBitSet(ulong i) {
+    return (1lu << (i % W)) & data[i/W];
+}
+
+ulong BitRank::NumberOfBits() {
+    return n;
+}
diff --git a/BitRank.h b/BitRank.h
new file mode 100644 (file)
index 0000000..c7783d2
--- /dev/null
+++ b/BitRank.h
@@ -0,0 +1,56 @@
+/*****************************************************************************
+ * Copyright (C) 2005, Rodrigo Gonzalez, all rights reserved.                *
+ *                                                                           *
+ * New RANK, SELECT, SELECT-NEXT and SPARSE RANK implementations.            *
+ *                                                                           *
+ * This library is free software; you can redistribute it and/or             *
+ * modify it under the terms of the GNU Lesser General Public                *
+ * License as published by the Free Software Foundation; either              *
+ * version 2.1 of the License, or (at your option) any later version.        *
+ *                                                                           *
+ * This library is distributed in the hope that it will be useful,           * 
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of            *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU         *
+ * Lesser General Public License for more details.                           *
+ *                                                                           *
+ * You should have received a copy of the GNU Lesser General Public          *
+ * License along with this library; if not, write to the Free Software       *
+ * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301 USA *
+ ****************************************************************************/
+
+#ifndef _BITSTREAM_H_
+#define _BITSTREAM_H_
+
+#include "Tools.h"
+
+class BitRank {
+private:
+    // Check word length
+#if W == 32
+    static const unsigned wordShift = 5;
+    static const unsigned superFactor = 8; // 256 bit blocks
+#else
+    static const unsigned wordShift = 6;
+    static const unsigned superFactor = 4; // 256 bit blocks
+#endif
+    
+    ulong *data; //here is the bit-array
+    bool owner;
+    ulong n,integers;
+    unsigned b,s; 
+    ulong *Rs; //superblock array
+    uchar *Rb; //block array
+    ulong BuildRankSub(ulong,  ulong); //internal use of BuildRank
+    void BuildRank(); //crea indice para rank
+public:
+    BitRank(ulong *, ulong, bool);
+    ~BitRank(); //destructor    
+    ulong rank(ulong i); //Rank from 0 to n-1
+    ulong select(ulong x); // gives the position of the x:th 1.
+    ulong select0(ulong x); // gives the position of the x:th 0.
+
+    bool IsBitSet(ulong i);
+    ulong NumberOfBits();
+};
+
+#endif
diff --git a/CSA.cpp b/CSA.cpp
new file mode 100644 (file)
index 0000000..a15c733
--- /dev/null
+++ b/CSA.cpp
@@ -0,0 +1,976 @@
+/******************************************************************************
+ *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
+ *                                                                            *
+ *                                                                            *
+ *   This program is free software; you can redistribute it and/or modify     *
+ *   it under the terms of the GNU Lesser General Public License as published *
+ *   by the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                      *
+ *                                                                            *
+ *   This program is distributed in the hope that it will be useful,          *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
+ *   GNU Lesser General Public License for more details.                      *
+ *                                                                            *
+ *   You should have received a copy of the GNU Lesser General Public License *
+ *   along with this program; if not, write to the                            *
+ *   Free Software Foundation, Inc.,                                          *
+ *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
+ *****************************************************************************/
+#include "CSA.h"
+#include "TextCollection.h"
+#include <iostream>
+#include <queue>
+#include <iomanip>
+#include <set>
+#include <vector>
+#include <utility>
+#include <cassert>
+#include <cstring> // For strlen()
+using std::vector;
+using std::pair;
+using std::make_pair;
+using std::map;
+
+using SXSI::TextCollection;
+
+
+////////////////////////////////////////////////////////////////////////////
+// Class CSA::THuffAlphabetRank
+
+CSA::THuffAlphabetRank::THuffAlphabetRank(uchar *s, TextPosition n, TCodeEntry *codetable, unsigned level) {
+    left = NULL;
+    right = NULL;
+    bitrank = NULL;
+    ch = s[0];
+    leaf = false;
+    this->codetable = codetable;
+    
+    bool *B = new bool[n];
+    TextPosition sum=0,i;
+    /*
+    for (i=0; i< n; i++) {
+       printf("%c:", (char)((int)s[i]-128));
+       for (r=0;r<codetable[(int)s[i]].bits;r++)
+          if (codetable[(int)s[i]].code & (1u <<r))
+            printf("1");
+         else printf("0");
+       printf("\n");     
+    }
+    printf("\n");
+    if (level > 100) return;*/
+    for (i=0;i<n;i++) 
+       if (codetable[(int)s[i]].code & (1u << level)) {
+          B[i] = true;
+          sum++;
+       }
+       else B[i] = false;
+    if (sum==0 || sum==n) {
+        delete [] B;
+       leaf = true;
+        return;
+    } 
+    uchar *sfirst, *ssecond;
+    //if (n-sum > 0)  
+        sfirst = new uchar[n-sum];
+    //if (sum > 0) 
+        ssecond = new uchar[sum];
+    unsigned j=0,k=0;
+   for (i=0;i<n;i++)
+        if (B[i]) ssecond[k++] = s[i];
+        else sfirst[j++] = s[i];
+    ulong *Binbits = new ulong[n/W+1];
+    for (i=0;i<n;i++)
+        Tools::SetField(Binbits,1,i,B[i]); 
+    delete [] B;
+    bitrank = new BitRank(Binbits,n,true);
+    //if (j > 0) { 
+        left = new THuffAlphabetRank(sfirst,j,codetable,level+1); 
+        delete [] sfirst;
+    //}
+    //if (k>0) {
+        right = new THuffAlphabetRank(ssecond,k,codetable,level+1); 
+        delete [] ssecond;
+    //}
+}
+
+
+bool CSA::THuffAlphabetRank::Test(uchar *s, ulong n) {
+    // testing that the code works correctly
+    int C[256];
+    unsigned i,j;
+    bool correct=true;
+    for (j=0;j<256;j++)
+        C[j] = 0;
+    for (i=0;i<n;i++) {
+        C[(int)s[i]]++;
+        if (C[(int)s[i]] != (int)rank((int)s[i],i)) {
+        correct = false;
+        printf("%d (%c): %d<>%d\n",i,(int)s[i]-128,C[(int)s[i]],(int)rank((int)s[i],i)); 
+        }         
+    }
+    return correct;            
+}
+
+CSA::THuffAlphabetRank::~THuffAlphabetRank() {
+    if (left!=NULL) delete left;
+    if (right!=NULL) delete right;
+    if (bitrank!=NULL)   
+        delete bitrank;
+}
+
+
+////////////////////////////////////////////////////////////////////////////
+// Class CSA
+
+/**
+ * Constructor inits an empty dynamic FM-index.
+ * Samplerate defaults to TEXTCOLLECTION_DEFAULT_SAMPLERATE.
+ */
+CSA::CSA(unsigned samplerate)
+    : n(0), alphabetrank(0), sampled(0), suffixes(0), positions(0),  
+      codetable(0), dynFMI(0)
+{
+    this->samplerate = samplerate;
+
+    // FIXME TODO : DynFMI needs distribution of characters before hand
+    // This will create fully balanced wavelet tree for all chars [0, 255].
+    uchar temp[256];
+    for (unsigned i = 0; i < 255; ++i)
+        temp[i] = i+1;
+    temp[255] = 0;
+    dynFMI = new DynFMI(temp, 1, 255, false);
+}
+
+/**
+ * Insert new text
+ *
+ * Given text must include end-marker.
+ * Text identifiers are numbered starting from 0.
+ */
+void CSA::InsertText(uchar const * text)
+{
+    // Sanity check:
+    assert(dynFMI != 0); 
+
+    TextPosition m = std::strlen((char *)text) + 1;
+    // Store text length and starting position
+    textLength.push_back(make_pair(m, n));
+    this->n += m;
+    dynFMI->addText(text, m);
+}
+
+void CSA::MakeStatic()
+{
+    // Sanity check:
+    if (dynFMI == 0)
+    {
+        std::cerr << "Data structure is already static (dynFMI == 0)." << std::endl;
+        std::exit(0);
+    }
+    uchar *bwt = dynFMI->getBWT();
+    /*printf("1234567890123456789\n");
+    for (TextPosition i = 0; i < n; i ++)
+        if (bwt[i] < 50)
+            printf("%d", (int)bwt[i]);
+        else
+            printf("%c", bwt[i]);
+    printf("\n");
+    */
+    
+/*    for (TextPosition i = 1; i <= n; i ++)
+    {
+        printf("LF[%lu, %c] = %lu\n", i, (*dynFMI)[i], dynFMI->LFmapping(i));
+    }*/
+    
+    // Sanity check
+    assert(textLength.size() == dynFMI->getCollectionSize());    
+
+    delete dynFMI;
+    dynFMI = 0;
+
+    ulong i, min = 0,
+             max;
+    for (i=0;i<256;i++)
+        C[i]=0;
+    for (i=0;i<n;++i)
+        C[(int)bwt[i]]++;
+    for (i=0;i<256;i++)
+        if (C[i]>0) {min = i; break;}          
+    for (i=255;i>=min;--i)
+        if (C[i]>0) {max = i; break;}
+    
+    // Print frequencies
+/*    for(i = 0; i < 256; i++)
+        if (C[i]>0) printf("C[%lu] = %lu\n", i, C[i]);
+        fflush(stdout);*/
+
+    ulong prev=C[0], temp;
+    C[0]=0;
+    for (i=1;i<256;i++) {          
+        temp = C[i];
+        C[i]=C[i-1]+prev;
+        prev = temp;
+    }
+    this->codetable = node::makecodetable(bwt,n);
+    alphabetrank = new THuffAlphabetRank(bwt,n, this->codetable,0);   
+    //if (alphabetrank->Test(bwt,n)) printf("alphabetrank ok\n");    
+
+/*    for (i = 0; i < n; ++i)
+    {
+        uchar c = alphabetrank->charAtPos(i);
+        TextPosition j = C[c]+alphabetrank->rank(c, i)-1;
+        printf("LF[%lu] = %lu\n", i, j);
+        }*/
+
+    // Calculate BWT end-marker position (of last inserted text)
+    i = 0;
+    while (bwt[i] != '\0')
+    {
+        uchar c = alphabetrank->charAtPos(i);
+        i = C[c]+alphabetrank->rank(c, i)-1;
+    }
+    bwtEndPos = i;
+    //printf("bwtEndPos = %lu\n", bwtEndPos);
+     
+    delete [] bwt;
+
+    // Make sampling tables
+    maketables();
+    // to avoid running out of unsigned, the sizes are computed in specific order (large/small)*small
+    // |class CSA| +256*|TCodeEntry|+|C[]|+|suffixes[]+positions[]|+...       
+    //printf("FMindex takes %d B\n",
+    //    6*W/8+256*3*W/8+256*W/8+ (2*n/(samplerate*8))*W+sampled->SpaceRequirementInBits()/8+alphabetrank->SpaceRequirementInBits()/8+W/8);
+}
+
+
+uchar* CSA::GetText(DocId k) const
+{
+    assert(k < (DocId)textLength.size());
+
+    uchar* result = new uchar[textLength[k].first];
+    TextPosition i = k;
+    TextPosition j = textLength[k].first-1;
+    result[j] = '\0';
+
+    uchar c = alphabetrank->charAtPos(i);
+    while (c != '\0')
+    {
+        --j;
+        result[j] = c;
+        i = C[c]+alphabetrank->rank(c,i)-1;
+        
+        c = alphabetrank->charAtPos(i); // "next" char.
+    }
+    assert(j == 0);
+    return result;
+}
+
+uchar* CSA::GetText(DocId k, TextPosition i, TextPosition j) const
+{
+    assert(k < (DocId)textLength.size());
+    assert(j < textLength[k].first);
+    assert(i <= j);
+
+    // Start position of k'th text
+    ulong start = textLength[k].second;
+
+    return Substring(i + start, j-i+1);
+}
+
+/**
+ * Existential queries
+ */
+bool CSA::IsPrefix(uchar const * pattern) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return textLength.size();
+
+    TextPosition sp = 0, ep = 0;
+    Search(pattern, m, &sp, &ep);
+
+    // Check for end-marker(s) in result interval
+    map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+    it = endmarkers.lower_bound(sp);
+    if (it != endmarkers.end() && it->first <= ep)
+        return true;
+    return false;
+}
+
+bool CSA::IsSuffix(uchar const *pattern) const
+{
+    // Here counting is as fast as isSuffix():
+    if (CountSuffix(pattern) > 0)
+        return true;
+    return false;
+}
+
+bool CSA::IsEqual(uchar const *pattern) const
+{
+    TextPosition m = std::strlen((char *)pattern);
+    if (m == 0)
+        return textLength.size();
+
+    TextPosition sp = 0, ep = 0;
+    // Match including end-marker
+    Search(pattern, m+1, &sp, &ep);
+
+    // Check for end-marker(s) in result interval
+    map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+    it = endmarkers.lower_bound(sp);
+    if (it != endmarkers.end() && it->first <= ep)
+        return true;
+    return false;
+}
+
+bool CSA::IsContains(uchar const * pattern) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return true;
+
+    TextPosition sp = 0, ep = 0;
+    // Just check if pattern exists somewhere
+    ulong count = Search(pattern, m, &sp, &ep);
+    if (count > 0)
+        return true;
+    return false;
+}
+
+bool CSA::IsLessThan(uchar const*) const
+{
+    // TODO
+    return false;
+}
+
+/**
+ * Counting queries
+ */
+unsigned CSA::CountPrefix(uchar const * pattern) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return textLength.size();
+
+    TextPosition sp = 0, ep = 0;
+    Search(pattern, m, &sp, &ep);
+
+    // Count end-markers in result interval
+    map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+    it = endmarkers.lower_bound(sp);
+    unsigned count = 0;
+    while (it != endmarkers.end() && it->first <= ep)
+    {
+        ++ count;
+        ++ it;
+    }
+    return count;
+}
+
+unsigned CSA::CountSuffix(uchar const * pattern) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return textLength.size();
+
+    TextPosition sp = 0, ep = 0;
+    // Search with end-marker
+    unsigned count = (unsigned) Search(pattern, m+1, &sp, &ep);
+    return count;
+}
+
+unsigned CSA::CountEqual(uchar const *pattern) const
+{
+    TextPosition m = strlen((char const *)pattern);
+    if (m == 0)
+        return textLength.size();
+
+    TextPosition sp = 0, ep = 0;
+    // Match including end-marker
+    Search(pattern, m+1, &sp, &ep);
+
+    // Count end-markers in result interval
+    map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+    it = endmarkers.lower_bound(sp);
+    unsigned count = 0;
+    while (it != endmarkers.end() && it->first <= ep)
+    {
+        ++ count;
+        ++ it;
+    }
+    return count;
+}
+
+unsigned CSA::CountContains(uchar const * pattern) const
+{
+    // Here counting is as slow as fetching the result set
+    // because we would have to filter out occ's that fall within same document.
+    TextCollection::document_result result = Contains(pattern);
+    return result.size();
+}
+
+unsigned CSA::CountLessThan(uchar const * pattern) const
+{
+    // TODO
+    return 0;
+}
+
+/**
+ * Document reporting queries
+ */
+TextCollection::document_result CSA::Prefix(uchar const * pattern) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result();
+
+    TextPosition sp = 0, ep = 0;
+    Search(pattern, m, &sp, &ep);
+
+    TextCollection::document_result result;
+    
+    // Report end-markers in result interval
+    map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+    it = endmarkers.lower_bound(sp);
+    unsigned count = 0;
+    while (it != endmarkers.end() && it->first <= ep)
+    {
+        // End-marker we found belongs to the "previous" doc in the collection
+        DocId docId = ((it->second).first + 1) % textLength.size();
+        result.push_back(docId);
+        ++ count;
+        ++ it;
+    }
+    return result;
+}
+
+TextCollection::document_result CSA::Suffix(uchar const * pattern) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result();
+
+    TextPosition sp = 0, ep = 0;
+    // Search with end-marker
+    Search(pattern, m+1, &sp, &ep);
+    TextCollection::document_result result;
+
+    // Check each occurrence
+    for (; sp <= ep; ++sp)
+    {
+        TextPosition i = sp;
+        uchar c = alphabetrank->charAtPos(i);
+        while (c != '\0' && !sampled->IsBitSet(i))
+        {
+            i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+            c = alphabetrank->charAtPos(i);
+        }
+        if (c == '\0')
+        {
+            // map::operator[] is not const, using find() instead:
+            pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+            // End-marker that we found belongs to the "previous" doc in collection:
+            DocId docId = (endm.first + 1) % textLength.size();
+            result.push_back(docId);
+        }
+        else
+        {
+            TextPosition textPos = suffixes[sampled->rank(i)-1];
+            result.push_back(DocIdAtTextPos(textPos));
+        }
+    }
+    
+    return result;
+}
+
+TextCollection::document_result CSA::Equal(uchar const *pattern) const
+{
+    TextPosition m = strlen((char const *)pattern);
+    if (m == 0)
+        return TextCollection::document_result();
+
+    TextPosition sp = 0, ep = 0;
+    // Match including end-marker
+    Search(pattern, m+1, &sp, &ep);
+
+    TextCollection::document_result result;
+
+    // Report end-markers in result interval
+    map<TextPosition, pair<DocId, TextPosition> >::const_iterator it;
+    it = endmarkers.lower_bound(sp);
+    unsigned count = 0;
+    while (it != endmarkers.end() && it->first <= ep)
+    {
+        // End-marker that we found belongs to the "previous" doc in collection:
+        DocId docId = ((it->second).first + 1) % textLength.size();
+        result.push_back(docId);
+        ++ count;
+        ++ it;
+    }
+    return result;
+}
+
+
+TextCollection::document_result CSA::Contains(uchar const * pattern) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return TextCollection::document_result();
+
+    TextPosition sp = 0, ep = 0;
+    // Search all occurrences
+    Search(pattern, m, &sp, &ep);
+    // We want unique document indentifiers, using std::set to collect them
+    std::set<DocId> resultSet;
+
+    // Check each occurrence
+    for (; sp <= ep; ++sp)
+    {
+        TextPosition i = sp;
+        uchar c = alphabetrank->charAtPos(i);
+        while (c != '\0' && !sampled->IsBitSet(i))
+        {
+            i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+            c = alphabetrank->charAtPos(i);
+        }
+        if (c == '\0')
+        {
+            // map::operator[] is not const, using find() instead:
+            pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+            // End-marker we found belongs to the "previous" doc in collection
+            DocId docId = (endm.first + 1) % textLength.size();
+            resultSet.insert(docId);
+        }
+        else
+        {
+            TextPosition textPos = suffixes[sampled->rank(i)-1];
+            resultSet.insert(DocIdAtTextPos(textPos));
+        }
+    }
+    
+    // Convert std::set to std::vector (TODO better way to construct result vector?)
+    TextCollection::document_result result;
+    for (std::set<DocId>::iterator it = resultSet.begin(); it != resultSet.end(); ++it)
+        result.push_back(*it);
+    return result;
+}
+
+TextCollection::document_result CSA::LessThan(uchar const * pattern) const
+{
+    // TODO
+    return document_result();
+}
+
+/**
+ * Full result set queries
+ */
+TextCollection::full_result CSA::FullContains(uchar const * pattern) const
+{
+    TextPosition m = strlen((char *)pattern);
+    if (m == 0)
+        return full_result();
+
+    TextPosition sp = 0, ep = 0;
+    // Search all occurrences
+    Search(pattern, m, &sp, &ep);
+    full_result result;
+
+    // Report each occurrence
+    for (; sp <= ep; ++sp)
+    {
+        TextPosition i = sp;
+        TextPosition dist = 0;
+        uchar c = alphabetrank->charAtPos(i);
+        while (c != '\0' && !sampled->IsBitSet(i))
+        {
+            i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping
+            c = alphabetrank->charAtPos(i);
+            ++ dist;
+        }
+        if (c == '\0')
+        {
+            // map::operator[] is not const, using find() instead:
+            pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+            // End-marker we found belongs to the "previous" doc in the collection:
+            DocId docId = (endm.first+1)%textLength.size();
+            result.push_back(make_pair(docId, dist)); 
+        }
+        else
+        {
+            TextPosition textPos = suffixes[sampled->rank(i)-1] + dist;
+            // Read document identifier and its starting position in text order
+            DocId docId = DocIdAtTextPos(textPos);
+            result.push_back(make_pair(docId, textPos - textLength[docId].second));
+        }
+    }
+    
+    return result;
+}
+
+
+/**
+ * Finds document identifier for given text position
+ *
+ * Starting text position of the document is stored into second parameter.
+ * Binary searching on text starting positions. 
+ */
+TextCollection::DocId CSA::DocIdAtTextPos(TextPosition i) const
+{
+    assert(i < n);
+
+    DocId a = 0;
+    DocId b = textLength.size() - 1;
+    while (a < b)
+    {
+        DocId c = a + (b - a)/2;
+        if (textLength[c].second > i)
+            b = c - 1;
+        else if (textLength[c+1].second > i)
+            return c;
+        else
+            a = c + 1;
+    }
+
+    assert(a < (DocId)textLength.size());
+    assert(i > textLength[a].second);
+    assert(i < (a == (DocId)textLength.size() - 1 ? n : textLength[a+1].second));
+    return a;
+}
+
+void CSA::Load(FILE *filename, unsigned samplerate)
+{
+    // TODO
+
+}
+
+void CSA::Save(FILE *filename)
+{
+    // TODO
+}
+
+
+/**
+ * Rest of the functions follow...
+ */
+
+uchar * CSA::Substring(TextPosition i, TextPosition l) const
+{
+    uchar *result = new uchar[l + 1];
+    if (l == 0)
+    {
+        result[0] = 0u;
+        return result;
+    }
+      
+    TextPosition dist;
+    TextPosition k = i + l - 1;
+    // Check for end of the string
+    if (k > n - 1)
+    {
+        l -= k - n + 1;
+        k = n - 1;
+    }
+    
+    TextPosition skip = samplerate - k % samplerate - 1;
+    TextPosition j;
+    if (k / samplerate + 1 >= n / samplerate)
+    {
+        j = bwtEndPos;
+        skip = n - k - 1;
+    }
+    else
+    {
+        j = positions[k/samplerate+1];
+        //cout << samplerate << ' ' << j << '\n';      
+    }    
+    
+    for (dist = 0; dist < skip + l; dist++) 
+    {
+        int c = alphabetrank->charAtPos(j);
+        if (c == '\0')
+        {
+            // map::operator[] is not const, using find() instead:
+            pair<DocId, TextPosition> endm = (endmarkers.find(j))->second;
+            j = endm.first; // LF-mapping for end-marker
+        }
+        else
+            j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
+        if (dist >= skip)
+            result[l + skip - dist - 1] = c;
+    }
+    result[l] = 0u;
+    return result;
+}
+
+  /*TextPosition CSA::inverse(TextPosition i)
+{
+    TextPosition skip = samplerate - i % samplerate;
+    TextPosition j;
+    if (i / samplerate + 1 >= n / samplerate)
+    {
+        j = bwtEndPos;
+        skip = n - i;
+    }
+    else
+    {
+        j = positions[i/samplerate+1];
+        //cout << samplerate << ' ' << j << '\n';   
+    }    
+    
+    while (skip > 0)
+    {
+        int c = alphabetrank->charAtPos(j);
+        j = C[c]+alphabetrank->rank(c,j)-1; // LF-mapping
+        skip --;
+    }
+    return j;
+    }*/
+ulong CSA::Search(uchar const * pattern, TextPosition m, TextPosition *spResult, TextPosition *epResult) const {
+    // use the FM-search replacing function Occ(c,1,i) with alphabetrank->rank(c,i)
+    int c = (int)pattern[m-1]; 
+    TextPosition i=m-1;
+    TextPosition sp = C[c];
+    TextPosition ep = C[c+1]-1;
+    while (sp<=ep && i>=1) 
+    {
+//         printf("i = %lu, c = %c, sp = %lu, ep = %lu\n", i, pattern[i], sp, ep);
+        c = (int)pattern[--i];
+        sp = C[c]+alphabetrank->rank(c,sp-1);
+        ep = C[c]+alphabetrank->rank(c,ep)-1;
+    }
+    *spResult = sp;
+    *epResult = ep;
+    if (sp<=ep)
+        return ep - sp + 1;
+    else
+        return 0;
+ }
+
+   /*TextPosition CSA::LF(uchar c, TextPosition &sp, TextPosition &ep) 
+{
+    sp = C[(int)c]+alphabetrank->rank(c,sp-1);
+    ep = C[(int)c]+alphabetrank->rank(c,ep)-1;
+
+    if (sp<=ep)
+        return ep - sp + 1;
+    else
+        return 0;
+        }*/
+
+CSA::TextPosition CSA::Lookup(TextPosition i) const // Time complexity: O(samplerate log \sigma)
+{
+    TextPosition dist=0;
+    while (!sampled->IsBitSet(i)) 
+    {
+        int c = alphabetrank->charAtPos(i);
+        if (c == '\0')
+        {
+            // map::operator[] is not const, using find() instead:
+            pair<DocId, TextPosition> endm = (endmarkers.find(i))->second;
+            return endm.second + dist; // returns text position.
+        }
+        i = C[c]+alphabetrank->rank(c,i)-1; // LF-mapping  
+        ++dist;
+    }
+    
+    return suffixes[sampled->rank(i)-1]+dist;
+}
+
+CSA::~CSA() {
+    delete alphabetrank;       
+    delete sampled;
+    delete [] suffixes;
+    delete [] positions;
+    delete [] codetable;
+}
+
+
+void CSA::maketables()
+{
+    ulong sampleLength = (n%samplerate==0) ? n/samplerate : n/samplerate+1;
+   
+    ulong *sampledpositions = new ulong[n/W+1];
+    suffixes = new ulong[sampleLength];   
+    positions = new ulong[sampleLength];
+   
+    ulong i,j=0;
+    for (i=0;i<n/W+1;i++)
+        sampledpositions[i]=0lu;
+    
+    ulong x,p=bwtEndPos;
+    DocId textId = textLength.size();
+    ulong ulongmax = 0;
+    ulongmax--;
+
+   //positions:
+    for (i=n-1;i<ulongmax;i--) { // TODO bad solution with ulongmax?
+      // i substitutes SA->GetPos(i)
+        x=(i==n-1)?0:i+1;
+
+        if (x % samplerate == 0) {
+            Tools::SetField(sampledpositions,1,p,1);
+            positions[x/samplerate] = p;
+        }
+
+        uchar c = alphabetrank->charAtPos(p);
+        if (c == '\0')
+        {
+            // Record the order of end-markers in BWT:
+            --textId;
+            endmarkers[p] = make_pair(textId, (TextPosition)x);
+            // LF-mapping from '\0' does not work with this (pseudo) BWT (see details from Wolfgang's thesis).
+            p = textId; // Correct LF-mapping to the last char of the previous text.
+        }
+        else
+            p = C[c]+alphabetrank->rank(c, p)-1;
+    }
+    assert(textId == 0);
+    /*for (map<ulong, pair<DocId, ulong> >::iterator it = endmarkers.begin(); it != endmarkers.end(); ++it)
+    { 
+        printf("endm[%u] = %lu (text pos: %lu)\n", (it->second).first, it->first, (it->second).second);
+        }*/
+
+    sampled = new BitRank(sampledpositions,n,true);   
+     
+   //suffixes:   
+    for(i=0; i<sampleLength; i++) {
+        j = sampled->rank(positions[i]);
+        if (j==0) j=sampleLength;
+        suffixes[ j-1] = (i*samplerate==n)?0:i*samplerate;
+    }   
+}
+
+uchar * CSA::LoadFromFile(const char *filename)
+{
+    uchar *s;
+    std::ifstream file (filename, std::ios::in|std::ios::binary);
+    if (file.is_open())
+    {
+        std::cerr << "Loading CSA from file: " << filename << std::endl;
+        file.read((char *)&bwtEndPos, sizeof(TextPosition));
+        s = new uchar[n];
+        for (ulong offset = 0; offset < n; offset ++)
+            file.read((char *)(s + offset), sizeof(char));
+        file.close();
+    }
+    else 
+    {
+        std::cerr << "Unable to open file " << filename << std::endl;
+        exit(1);
+    }
+    return s;
+}
+
+void CSA::SaveToFile(const char *filename, uchar *bwt)
+{
+    std::ofstream file (filename, std::ios::out|std::ios::binary|std::ios::trunc);
+    if (file.is_open())
+    {
+        std::cerr << "Writing CSA to file: " << filename << std::endl;
+        file.write((char *)&bwtEndPos, sizeof(TextPosition));
+        std::cerr << "Writing BWT of " << n << " bytes." << std::endl;
+        for (ulong offset = 0; offset < n; offset ++)
+            file.write((char *)(bwt + offset), sizeof(char));
+        file.close();
+    }
+    else 
+    {
+        std::cerr << "Unable to open file " << filename << std::endl;
+        exit(1);
+    }
+}
+
+/*uchar * CSA::BWT(uchar *text) 
+{
+    uchar *s;
+
+    DynFMI *wt = new DynFMI((uchar *) text, n);
+    s = wt->getBWT();
+    for (ulong i=0;i<n;i++) 
+        if (s[i]==0u) {
+            bwtEndPos = i;  // TODO: better solution ?
+            i=n;
+        }
+
+    delete wt;
+    return s;
+    }*/
+
+CSA::TCodeEntry * CSA::node::makecodetable(uchar *text, TextPosition n)
+{
+    TCodeEntry *result = new TCodeEntry[ 256 ];
+    
+    count_chars( text, n, result );
+    std::priority_queue< node, std::vector< node >, std::greater<node> > q;
+//
+// First I push all the leaf nodes into the queue
+//
+    for ( unsigned int i = 0 ; i < 256 ; i++ )
+        if ( result[ i ].count )
+            q.push(node( i, result[ i ].count ) );
+//
+// This loop removes the two smallest nodes from the
+// queue.  It creates a new internal node that has
+// those two nodes as children. The new internal node
+// is then inserted into the priority queue.  When there
+// is only one node in the priority queue, the tree
+// is complete.
+//
+
+    while ( q.size() > 1 ) {
+        node *child0 = new node( q.top() );
+        q.pop();
+        node *child1 = new node( q.top() );
+        q.pop();
+        q.push( node( child0, child1 ) );
+    }
+//
+// Now I compute and return the codetable
+//
+    q.top().maketable(0u,0u, result);
+    q.pop();
+    return result;
+}
+
+
+void CSA::node::maketable(unsigned code, unsigned bits, TCodeEntry *codetable) const
+{
+    if ( child0 ) 
+    {
+        child0->maketable( SetBit(code,bits,0), bits+1, codetable );
+        child1->maketable( SetBit(code,bits,1), bits+1, codetable );
+        delete child0;
+        delete child1;
+    } 
+    else 
+    {
+        codetable[value].code = code;    
+        codetable[value].bits = bits;
+    }
+}
+
+void CSA::node::count_chars(uchar *text, TextPosition n, TCodeEntry *counts )
+{
+    ulong i;
+    for (i = 0 ; i < 256 ; i++ )
+        counts[ i ].count = 0;
+    for (i=0; i<n; i++)
+        counts[(int)text[i]].count++; 
+}
+
+unsigned CSA::node::SetBit(unsigned x, unsigned pos, unsigned bit) {
+      return x | (bit << pos);
+}
+
diff --git a/CSA.h b/CSA.h
new file mode 100644 (file)
index 0000000..1ecddc6
--- /dev/null
+++ b/CSA.h
@@ -0,0 +1,231 @@
+/******************************************************************************
+ *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
+ *                                                                            *
+ *                                                                            *
+ *   This program is free software; you can redistribute it and/or modify     *
+ *   it under the terms of the GNU Lesser General Public License as published *
+ *   by the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                      *
+ *                                                                            *
+ *   This program is distributed in the hope that it will be useful,          *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
+ *   GNU Lesser General Public License for more details.                      *
+ *                                                                            *
+ *   You should have received a copy of the GNU Lesser General Public License *
+ *   along with this program; if not, write to the                            *
+ *   Free Software Foundation, Inc.,                                          *
+ *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
+ *****************************************************************************/
+
+#ifndef _CSA_H_
+#define _CSA_H_
+
+#include <map>
+#include <vector>
+#include "BitRank.h"
+#include "dynFMI.h"
+#include "TextCollection.h"
+
+/**
+ * Implementation of the TextCollection interface
+ *
+ * Implementation notes:
+ *
+ * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
+ * which requires O(n log \sigma) bits of memory. If we know the distribution
+ * of characters beforehand, space can easily be made O(nH_0).
+ * 
+ * The method MakeStatic() constructs a Succinct Suffix Array with a huffman 
+ * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
+ *
+ * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
+ * with any FM-index variant supporting LF-mapping.
+ */
+class CSA : public SXSI::TextCollection {
+public:
+    /**
+     * Constructor with (default) samplerate
+     */
+    explicit CSA(unsigned);
+    ~CSA();
+    /**
+     * Following functions implement the interface described in TextCollection.h.
+     * For details/documentation, see TextCollection.h.
+     */
+    void InsertText(uchar const *);
+    void MakeStatic();
+    uchar* GetText(DocId) const;
+    uchar* GetText(DocId, TextPosition, TextPosition) const;
+
+    bool IsPrefix(uchar const *) const;
+    bool IsSuffix(uchar const *) const;
+    bool IsEqual(uchar const *) const;
+    bool IsContains(uchar const *) const;
+    bool IsLessThan(uchar const *) const;
+
+    unsigned CountPrefix(uchar const *) const;
+    unsigned CountSuffix(uchar const *) const;
+    unsigned CountEqual(uchar const *) const;
+    unsigned CountContains(uchar const *) const;
+    unsigned CountLessThan(const unsigned char*) const;
+    
+    // document_result is inherited from SXSI::TextCollection.
+    document_result Prefix(uchar const *) const;
+    document_result Suffix(uchar const *) const;
+    document_result Equal(uchar const *) const;
+    document_result Contains(uchar const *) const;
+    document_result LessThan(uchar const *) const;
+
+    // full_result is inherited from SXSI::TextCollection.
+    full_result FullContains(uchar const *) const;
+
+    // TODO implement:
+    void Load(FILE *, unsigned);
+    void Save(FILE *);
+
+private:
+    class TCodeEntry {
+    public:
+        unsigned count;
+        unsigned bits;
+        unsigned code;
+        TCodeEntry() {count=0;bits=0;code=0u;};
+    };   
+     
+
+    class THuffAlphabetRank {
+    // using fixed 0...255 alphabet
+    private:
+        BitRank *bitrank;
+        THuffAlphabetRank *left;
+        THuffAlphabetRank *right;
+        TCodeEntry *codetable;
+        uchar ch;
+        bool leaf;
+    public:
+        THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
+        ~THuffAlphabetRank();
+        bool Test(uchar *, TextPosition);
+        
+        inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
+            THuffAlphabetRank const * temp=this;
+            if (codetable[c].count == 0) return 0;
+            unsigned level = 0;
+            unsigned code = codetable[c].code;
+            while (!temp->leaf) {
+                if ((code & (1u<<level)) == 0) {
+                i = i-temp->bitrank->rank(i); 
+                    temp = temp->left; 
+                }
+                else { 
+                    i = temp->bitrank->rank(i)-1; 
+                    temp = temp->right;
+                }
+               ++level;
+            } 
+            return i+1;
+        };   
+        inline bool IsCharAtPos(int c, TextPosition i) const {
+            THuffAlphabetRank const * temp=this;
+            if (codetable[c].count == 0) return false;
+            unsigned level = 0;
+            unsigned code = codetable[c].code;      
+            while (!temp->leaf) {
+                if ((code & (1u<<level))==0) {
+                    if (temp->bitrank->IsBitSet(i)) return false;
+                    i = i-temp->bitrank->rank(i); 
+                    temp = temp->left; 
+                }
+                else { 
+                    if (!temp->bitrank->IsBitSet(i)) return false;         
+                    i = temp->bitrank->rank(i)-1; 
+                    temp = temp->right;
+                }
+               ++level;
+            } 
+            return true;
+        }
+        inline int charAtPos(TextPosition i) const {
+            THuffAlphabetRank const * temp=this;
+            while (!temp->leaf) {
+                if (temp->bitrank->IsBitSet(i)) {
+                i = temp->bitrank->rank(i)-1;
+                temp = temp->right;
+            }
+            else {
+                i = i-temp->bitrank->rank(i); 
+                    temp = temp->left;      
+            }         
+            }
+            return (int)temp->ch;
+        }
+    };
+
+    class node {
+    private:
+        unsigned weight;
+        uchar value;
+        node *child0;
+        node *child1;
+    
+        void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
+        static void count_chars(uchar *, TextPosition , TCodeEntry *);
+        static unsigned SetBit(unsigned , unsigned , unsigned );
+    public:
+        node( unsigned char c = 0, unsigned i = 0 ) {
+            value = c;
+            weight = i;
+            child0 = 0;
+            child1 = 0;
+        }
+        
+        node( node* c0, node *c1 ) {
+            value = 0;
+            weight = c0->weight + c1->weight;
+            child0 = c0;
+            child1 = c1;
+        }
+
+      
+        bool operator>( const node &a ) const {
+            return weight > a.weight;
+        }
+
+        static TCodeEntry *makecodetable(uchar *, TextPosition);
+    };
+    
+    static const unsigned char print = 1;
+    static const unsigned char report = 1;
+    TextPosition n;
+    unsigned samplerate;
+    ulong C[256];
+    TextPosition bwtEndPos;
+    THuffAlphabetRank *alphabetrank;
+    BitRank *sampled; 
+    ulong *suffixes;
+    ulong *positions;
+    TCodeEntry *codetable;
+    DynFMI *dynFMI;
+    // Map from end-marker in BWT to pair (textId, sampled text position)
+    std::map<TextPosition, std::pair<DocId, TextPosition> > endmarkers;
+    // Vector of pairs of <text length, text start position>
+    std::vector<std::pair<TextPosition, TextPosition> > textLength;
+    
+    // Private methods
+    uchar * BWT(uchar *);
+    uchar * LoadFromFile(const char *);
+    void SaveToFile(const char *, uchar *);
+    void maketables();
+
+    // Following are not part of the public API
+    DocId DocIdAtTextPos(TextPosition) const;
+    ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
+    TextPosition Lookup(TextPosition) const;
+    TextPosition Inverse(TextPosition) const;
+    TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
+    TextPosition Psi(TextPosition) const;
+    uchar * Substring(TextPosition, TextPosition) const;
+};
+
+#endif
diff --git a/TextCollection.cpp b/TextCollection.cpp
new file mode 100644 (file)
index 0000000..0300899
--- /dev/null
@@ -0,0 +1,16 @@
+#include "TextCollection.h"
+#include "CSA.h"
+
+namespace SXSI
+{
+    /**
+     * Init text collection
+     *
+     * See CSA.h for more details.
+     */
+    TextCollection * TextCollection::InitTextCollection(unsigned samplerate)
+    {
+        TextCollection *result = new CSA(samplerate);
+        return result;
+    }
+}
diff --git a/TextCollection.h b/TextCollection.h
new file mode 100644 (file)
index 0000000..e7e36e2
--- /dev/null
@@ -0,0 +1,168 @@
+/******************************************************************************
+ *   Copyright (C) 2008 by Niko Valimaki <nvalimak@cs.helsinki.fi>            *
+ *   Text collection interface for an in-memory XQuery/XPath engine           *
+ *                                                                            *
+ *   This program is free software; you can redistribute it and/or modify     *
+ *   it under the terms of the GNU Lesser General Public License as published *
+ *   by the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                      *
+ *                                                                            *
+ *   This program is distributed in the hope that it will be useful,          *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
+ *   GNU Lesser General Public License for more details.                      *
+ *                                                                            *
+ *   You should have received a copy of the GNU Lesser General Public License *
+ *   along with this program; if not, write to the                            *
+ *   Free Software Foundation, Inc.,                                          *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.                *
+ ******************************************************************************/ 
+
+#ifndef _SXSI_TextCollection_h_
+#define _SXSI_TextCollection_h_
+
+#include "Tools.h" // Defines ulong and uchar.
+#include <vector>
+#include <utility> // Defines std::pair.
+
+// Default samplerate for suffix array samples
+#define TEXTCOLLECTION_DEFAULT_SAMPLERATE 64
+
+namespace SXSI
+{
+    /**
+     * General interface for a text collection
+     *
+     * Class is virtual, make objects by calling 
+     * the static method InitTextCollection().
+     */
+    class TextCollection
+    {
+    public:
+        // Type of document identifier
+        typedef int DocId;
+        // Type for text position (FIXME ulong or long?)
+        typedef ulong TextPosition;
+
+        /**
+         * Init an instance of a text collection object
+         *
+         * Returns a pointer to an object implementing this interface.
+         */
+        static TextCollection * InitTextCollection(unsigned samplerate = TEXTCOLLECTION_DEFAULT_SAMPLERATE);
+        /**
+         * Load from a file
+         *
+         * New samplerate can be given, otherwise will use the one specified in the save file!
+         * Note: This is not a static method; call InitTextCollection() first to get the object handle.
+         *
+         * Throws an exception if something goes wrong (unlikely since we are passing a file descriptor).
+         * 
+         */
+        virtual void Load(FILE *, unsigned samplerate = 0) = 0;
+        /**
+         * Save data structure into a file
+         */
+        virtual void Save(FILE *) = 0;
+        /**
+         * Virtual destructor
+         */
+        virtual ~TextCollection() { };
+        /** 
+         * Insert text
+         *
+         * Must be a zero-terminated string from alphabet [1,255].
+         * Can not be called after makeStatic().
+         * The i'th text insertion gets an identifier value i-1.
+         * In other words, document identifiers start from 0.
+         */
+        virtual void InsertText(uchar const *) = 0;
+        /**
+         * Make static
+         *
+         * Convert to a static collection; reduces space and time complexities.
+         * New texts can not be inserted after this operation.
+         */
+        virtual void MakeStatic() = 0;
+        
+        /**
+         * Displaying content
+         *
+         * Returns the i'th text in the collection.
+         * The numbering starts from 0.
+         */
+        virtual uchar* GetText(DocId) const = 0;
+        /**
+         * Returns substring [i, j] of k'th text
+         *
+         * Note: Parameters i and j are text positions inside the k'th text.
+         */
+        virtual uchar* GetText(DocId, TextPosition, TextPosition) const = 0;
+        /**
+         * Returns backwards (reverse) iterator to the end of i'th text
+         * 
+         * Note: Do we need this?
+         * Forward iterator would be really in-efficient compared to
+         * getText(k) and getText(k, i, j).
+         *
+         * TODO Define and implement const_reverse_iterator.
+         */
+        //const_reverse_iterator rend(DocId) const;
+        
+        /**
+         * Existential queries
+         */
+        // Is there a text prefixed by given string?
+        virtual bool IsPrefix(uchar const *) const = 0;
+        // Is there a text having given string as a suffix?
+        virtual bool IsSuffix(uchar const *) const = 0;
+        // Is there a text that equals given string?
+        virtual bool IsEqual(uchar const *) const = 0;
+        // Does a text contain given string?
+        virtual bool IsContains(uchar const *) const = 0;
+        // Is there a text that is lexicographically less than given string?
+        virtual bool IsLessThan(uchar const *) const = 0;
+
+        /**
+         * Counting queries 
+         * 
+         * Result is the number of documents.
+         */
+        virtual unsigned CountPrefix(uchar const *) const = 0;
+        virtual unsigned CountSuffix(uchar const *) const = 0;
+        virtual unsigned CountEqual(uchar const *) const = 0;
+        virtual unsigned CountContains(uchar const *) const = 0;
+        virtual unsigned CountLessThan(uchar const *) const = 0;
+
+        /**
+         * Document reporting queries
+         *
+         * Result is a vector of document id's in some undefined order.
+         */
+        // Data type for results
+        typedef std::vector<DocId> document_result;
+        virtual document_result Prefix(uchar const *) const = 0;
+        virtual document_result Suffix(uchar const *) const = 0;
+        virtual document_result Equal(uchar const *) const = 0;
+        virtual document_result Contains(uchar const *) const = 0;
+        virtual document_result LessThan(uchar const *) const = 0;
+
+        /**
+         * Full reporting queries
+         *
+         * Result is a vector of pairs <doc id, offset> in some undefined order.
+         */
+        // Data type for results
+        typedef std::vector<std::pair<DocId, TextPosition> > full_result;
+        virtual full_result FullContains(uchar const *) const = 0;
+
+    protected:
+        // Protected constructor; call the static function InitTextCollection().
+        TextCollection() { };
+
+        // No copy constructor or assignment
+        TextCollection(TextCollection const&);
+        TextCollection& operator = (TextCollection const&);
+    };
+}
+#endif
diff --git a/Tools.cpp b/Tools.cpp
new file mode 100644 (file)
index 0000000..d7260d9
--- /dev/null
+++ b/Tools.cpp
@@ -0,0 +1,95 @@
+/*
+ * Collection of basic tools and defines
+ */
+
+#include "Tools.h"
+#include <cstdio>
+
+
+time_t Tools::startTime;
+
+void Tools::StartTimer()
+{
+    startTime = time(NULL);
+}
+
+double Tools::GetTime()
+{
+    time_t stopTime = time(NULL);
+    return difftime( stopTime, startTime );
+}
+
+uchar * Tools::GetRandomString(unsigned min, unsigned max, unsigned &alphabetSize)
+{
+    unsigned len = std::rand() % (max - min) + min;
+    alphabetSize = std::rand() % 26 + 1;
+    uchar* temp = new uchar[len + 2];
+    for (unsigned i = 0; i < len; i++)
+        temp[i] = 97 + std::rand() % alphabetSize;
+    temp[len] = 0u ;temp[len+1] = '\0';
+    return temp;
+}
+
+
+void Tools::PrintBitSequence(ulong *A, ulong len)
+{
+    for(ulong i = 0; i < len; i++)
+        if (GetField(A, 1, i))
+            std::cout << "1";
+        else
+            std::cout << "0";
+    std::cout << "\n";
+}
+
+unsigned Tools::FloorLog2(ulong i)
+{
+    unsigned b = 0;
+    if (i == 0)
+        return 0;
+    while (i)
+    { 
+        b++; 
+        i >>= 1; 
+    }
+    return b - 1;
+}
+
+unsigned Tools::CeilLog2(ulong i)
+{
+    unsigned j = FloorLog2(i);
+    if ((ulong)(1lu << j) != i)
+        return j + 1;
+        
+    return j;
+}
+
+uchar * Tools::GetFileContents(char *filename, ulong maxSize)
+{
+    std::ifstream::pos_type posSize;
+    std::ifstream file ((char *)filename, std::ios::in|std::ios::binary|std::ios::ate);
+    if (file.is_open())
+    {
+        posSize = file.tellg();
+        ulong size = posSize;
+        if (maxSize != 0 && size > maxSize)
+            size = maxSize;
+        char *memblock = new char [size + 1];
+        file.seekg (0, std::ios::beg);
+        file.read (memblock, size);
+        memblock[size] = '\0';
+        file.close();
+       return (uchar *)memblock;
+    }
+    else
+        return 0;
+}
+
+unsigned Tools::bits (ulong n)
+
+   { unsigned b = 0;
+     while (n)
+    { b++; n >>= 1; }
+     return b;
+   }
+
+
diff --git a/Tools.h b/Tools.h
new file mode 100644 (file)
index 0000000..121e46e
--- /dev/null
+++ b/Tools.h
@@ -0,0 +1,110 @@
+/*
+ * Collection of basic tools and defines
+ */
+
+
+#ifndef _TOOLS_H_
+#define _TOOLS_H_
+
+#include <iostream>
+#include <fstream>
+#include <ctime>
+#include <climits>
+
+// Generates an error if __WORDSIZE is not defined
+#ifndef __WORDSIZE
+#error Missing definition of __WORDSIZE; Please define __WORDSIZE in Tools.h!
+#endif
+
+// Check word length on GNU C/C++:
+// __WORDSIZE should be defined in <bits/wordsize.h>, which is #included from <limits.h>
+#if __WORDSIZE == 64
+#   define W 64
+#else
+#   define W 32
+#endif
+
+#define WW (W*2)
+#define Wminusone (W-1)
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+
+class Tools
+{
+private:
+    static time_t startTime;
+public:
+    static void StartTimer();
+    static double GetTime();
+    static uchar * GetRandomString(unsigned, unsigned, unsigned &);
+    static void PrintBitSequence(ulong *, ulong);
+    static uchar * GetFileContents(char *, ulong =0);
+    static ulong * bp2bitstream(uchar *);
+    static unsigned FloorLog2(ulong);
+    static unsigned CeilLog2(ulong);
+    static unsigned bits (ulong);
+
+    static inline void SetField(ulong *A, register unsigned len, register ulong index, register ulong x) 
+    {
+        ulong i = index * len / W, 
+                 j = index * len - i * W;
+        ulong mask = (j+len < W ? ~0lu << j+len : 0) 
+                        | (W-j < W ? ~0lu >> (W-j) : 0);
+        A[i] = (A[i] & mask) | x << j;
+        if (j + len > W) 
+        {
+            mask = ((~0lu) << (len + j - W));  
+            A[i+1] = (A[i+1] & mask)| x >> (W - j);
+        }     
+    }
+    
+    static inline ulong GetField(ulong *A, register unsigned len, register ulong index) 
+    {
+        register ulong i = index * len / W, 
+                       j = index * len - W * i, 
+                       result;
+        if (j + len <= W)
+            result = (A[i] << (W - j - len)) >> (W - len);
+        else 
+        {
+            result = A[i] >> j;
+            result = result | (A[i+1] << (WW - j - len)) >> (W - len);
+        }
+        return result;
+    }
+       
+    
+    static inline ulong GetVariableField(ulong *A, register unsigned len, register ulong index) 
+    {
+       register ulong i=index/W, j=index-W*i, result;
+       if (j+len <= W)
+       result = (A[i] << (W-j-len)) >> (W-len);
+       else {
+             result = A[i] >> j;
+             result = result | (A[i+1] << (WW-j-len)) >> (W-len);
+          }
+       return result;
+    }
+
+    static inline void SetVariableField(ulong *A, register unsigned len, register ulong index, register ulong x) {
+        ulong i=index/W, 
+                    j=index-i*W;
+        ulong mask = (j+len < W ? ~0lu << j+len : 0) 
+                        | (W-j < W ? ~0lu >> (W-j) : 0);
+        A[i] = (A[i] & mask) | x << j;
+        if (j+len>W) {
+            mask = ((~0lu) << (len+j-W));  
+            A[i+1] = (A[i+1] & mask)| x >> (W-j);
+        } 
+    }
+
+
+};
+
+#endif
diff --git a/bittree.cpp b/bittree.cpp
new file mode 100644 (file)
index 0000000..0003461
--- /dev/null
@@ -0,0 +1,989 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *   No object matches key 'wgerlach'.   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+#include "bittree.h"
+
+
+using std::cout;
+using std::endl;
+using std::cerr;
+using std::bitset;
+using std::ostream;
+
+//std::ostream& operator<<(std::ostream& os, node& n) { ... }
+
+
+  BVTree::~BVTree(){
+      delete tempbit;
+      tempbit = 0;
+  }
+
+       
+
+/**
+BVTree::BVTree(char *readfile){
+  
+  std::ifstream file(readfile);
+  if (!file) 
+  {
+         cout << "error reading file "<< readfile <<" !\n";
+         exit(EXIT_FAILURE);
+  }
+
+
+  tempnil = (BVNode*) ((RBTree*) this)->nil;
+  tempbit = new bitset<2*logn>;
+  
+
+  char c;
+  bitset<8> bs;
+
+  while (file.get(c)) 
+  {
+         bs = c;
+
+         for (int i=7; i>=0; i--) 
+                 appendBit(bs[i]);
+  }
+  
+}
+**/
+
+
+
+
+void BVTree::iterateReset(){
+       iterate=1;
+       iterateLocal=0;
+       if (root!=nil) {
+               iterateNode = (BVNode*) treeMinimum(root);
+               iterateRank = ((*iterateNode->block)[0])?1:0;
+       } else {
+               iterateNode=getNil();
+               iterateRank = 0;
+               }
+}
+  
+bool BVTree::iterateGetBit(){
+       return ((*iterateNode->block)[iterateLocal]);
+}
+
+ulong BVTree::iterateGetRank(bool bit){
+       if (bit) return iterateRank;
+       else return (iterate - iterateRank);
+}
+
+bool BVTree::iterateNext(){
+       iterate++;
+
+       if (iterate > getPositions()) return false;
+       
+       
+       if (iterateLocal < iterateNode->myPositions-1) {
+               iterateLocal++;
+       } else {
+               // jump to next leaf;
+               iterateNode = (BVNode*) treeSuccessor(iterateNode);
+               #ifndef NDEBUG
+               if (iterateNode==getNil()) {
+                       cout << "iterateNode==getNil()" << endl;
+                       return false;
+               }
+               #endif
+               iterateLocal=0;
+       }
+       if  ((*iterateNode->block)[iterateLocal]) iterateRank++;
+       
+       return true;
+}
+
+ulong BVTree::getPositions(){
+       
+       if (getRoot() == getNil()) return 0;
+       return getRoot()->subTreePositions;
+}
+
+ulong BVTree::getRank(){
+       
+       if (getRoot() == getNil()) return 0;
+       return getRoot()->subTreeRank;
+}
+
+
+void BVTree::printNode(BVNode *n){
+       int commas=(2*logn)/10;
+       int commashift = -1;
+       
+       
+       cout << "address: " << n << endl;
+
+       if (n->block != 0 && (n!=getNil()))
+       {
+               int size = 2*logn + 1 +commas;
+               
+               char *myblock = (char *) new char[size];
+               if (myblock == 0) {
+                       cerr << "error: printNode: myblock == 0" << endl;
+                       exit(0);
+               }
+               
+               //read: i, write: i+commashift
+               for (int i=0; i<(int)(n->myPositions); i++){
+                       if (i%10 == 0) commashift++;
+                       myblock[i+commashift]='0' + (*n->block)[i];
+                       if (i+commashift >= size-1) {
+                               cerr << "X) printNode: array wrong index" << endl;
+                               exit(0);
+                       }
+                       
+               }
+               
+               cout << "size=" << size << endl;
+               cout << "n->myPositions=" << n->myPositions << endl;
+               for (int i=n->myPositions; i<(2*logn); i++){
+                       if ((i%10) == 0) commashift++;
+                       
+                       myblock[i+commashift]='-';
+                       if (i+commashift > size-2) {
+                               cerr << "A) printNode: array wrong index" << endl;
+                               exit(0);
+                       }
+               }
+               
+               commashift = 0;
+               for (int i=10; i < 2*logn; i++){
+                       if (i%10 == 0) 
+                       {
+                               if (i+commashift >= size-2) {
+                                       cerr << "B) printNode: array wrong index" << endl;
+                                       exit(0);
+                               }
+                               
+                               myblock[i+commashift]=',';
+                               commashift++;
+                       }
+
+               }
+
+               myblock[size - 1]='\0';
+               
+               cout << "block: \"" << myblock << "\"" << endl;
+               delete myblock;
+               
+       }
+       else 
+               cout << "block: none" << endl;
+       
+       
+       cout << "myPositions: " << n->myPositions << endl;
+       cout << "myRank: " << n->myRank << endl;
+       cout << "subTreePositions: " << n->subTreePositions << endl;
+       cout << "subTreeRank: " << n->subTreeRank << endl;
+       cout << "color: " << ((n->color==RED)?"RED":"BLACK") << endl;
+       cout << "parent: " << n->getParent() << endl;
+       cout << "left: " << n->getLeft() << endl;
+       cout << "right:" << n->getRight() << endl << endl;
+
+}
+
+
+int BVTree::getTreeMaxDepth(){
+       return getNodeMaxDepth(root);
+}
+
+int BVTree::getTreeMinDepth(){
+       return getNodeMinDepth(root);
+}
+
+
+void BVTree::updateCounters(BVNode *n){
+
+       if (n == getNil()) return;
+
+       ulong lR = 0;
+       ulong lP = 0;
+       ulong rR = 0;
+       ulong rP = 0;
+
+       if (n->getLeft() != getNil()) {
+               lR = n->getLeft()->subTreeRank;
+               lP = n->getLeft()->subTreePositions;
+       }
+
+       if (n->getRight() != getNil()) {
+               rR = n->getRight()->subTreeRank;
+               rP = n->getRight()->subTreePositions;
+       }
+
+       n->subTreeRank     =lR + rR + n->myRank;
+       n->subTreePositions=lP + rP + n->myPositions;
+       
+}
+
+ulong BVTree::getLocalRank(BVNode* n, ulong position){
+       
+       #ifndef NDEBUG
+       if (position > n->myPositions) {
+               cerr << "error: getLocalRank: invalid position in block.\n";
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       
+       *tempbit =*(n->block)<<((2*logn)-position);
+       return tempbit->count(); 
+
+       // old version:
+       //rank = 0;
+       //for (ulong i = 0; i < position; i++) {
+       //      if ((*n->block)[i]) rank++;
+       //}
+}
+
+ulong BVTree::getLocalSelect1(BVNode* n, ulong query){
+       ulong select =0;
+       #ifndef NDEBUG
+       if (query > n->myPositions) {
+               cerr << "error: getLocalSelect1: invalid position in block.\n";
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       ulong i;
+       for (i = 0; select < query; i++) { // TODO is there a faster solution similar to rank ?
+               if ((*n->block)[i]) select++;
+       }
+
+       return i;
+}
+
+ulong BVTree::getLocalSelect0(BVNode* n, ulong query){
+       ulong select =0;
+       #ifndef NDEBUG
+       if (query > n->myPositions) {
+               cerr << "error: getLocalSelect0: invalid position in block.\n";
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       ulong i;
+       for (i = 0; select < query; i++) {
+               if (!(*n->block)[i]) select++;
+       }
+
+       return i;
+}
+
+
+void BVTree::printNode(ulong i){
+       BVNode* x = getRoot();
+
+       #ifndef NDEBUG  
+       if (x == getNil()) { 
+               cerr << "error: printNode(int i): root=NULL.\n";
+               exit(EXIT_FAILURE);
+               
+       }
+       
+       if (i > x->myPositions) {
+               cerr << "error: printNode(int i): invalid position in block.\n";
+               exit(EXIT_FAILURE);
+       }
+       #endif  
+       
+       ulong lP=0;
+       bool search = true;
+       //find the corresponding block:
+       while (search) 
+       {
+               
+               lP = x->getLeft()->subTreePositions;
+               
+
+               if (lP >= i)
+               {
+                       x=x->getLeft();
+               }
+               else if (lP+x->myPositions >= i){
+                       i-=lP;
+                       search = false;
+               }
+               else{
+                       i-=(lP+x->myPositions);
+                       x=x->getRight();
+               }
+       }
+
+
+       cout << "i=" << i << endl;
+       printNode(x);
+}
+
+
+bool BVTree::operator[](ulong i){
+
+       BVNode* x = getRoot();
+       ulong lsP=0;
+       bool search = true;
+       //find the corresponding block:
+       while (search) 
+       {
+               
+               lsP = x->getLeft()->subTreePositions;
+
+               if (lsP >= i)
+               {
+                       #ifndef NDEBUG
+                       if (x->getLeft()==getNil()) {
+                               cout << "lsP: " << lsP << endl;
+                               printNode(x);
+                               cout << "ihhh" << endl;
+                               checkTree();
+                               exit(EXIT_FAILURE);
+                       }
+                       #endif
+                       x=x->getLeft();
+               }
+               else if (lsP+x->myPositions >= i){
+                       i-=lsP;
+                       search = false;
+               }
+               else{
+                       i-=(lsP+x->myPositions);
+                       #ifndef NDEBUG
+                       if (x->getRight()==getNil()) {
+                               cout << "i: " << i << endl;
+                               cout << "lsP: " << lsP << endl;
+                               cout << "x->myPositions: " << x->myPositions << endl;
+                               printNode(x);
+                               cout << "ihhh" << endl;
+                               checkTree();
+                               exit(EXIT_FAILURE);
+                               }
+                       #endif
+                       x=x->getRight();
+               }
+       }
+
+       return (*x->block)[i-1];  
+}
+
+
+ulong BVTree::rank1(ulong i){
+       BVNode* x = getRoot();
+
+       if (i == this->getPositions() + 1) i--;
+       #ifndef NDEBUG  
+       if (i > this->getPositions() ) {
+               cerr << "error: rank1(0): invalid position in bittree: " << i << endl;
+               cerr << "error: rank1(0): this->getPositions(): " <<  this->getPositions()<< endl;
+               exit(EXIT_FAILURE);
+       }
+       #endif  
+
+
+       ulong lsP=0;
+       ulong lsR=0;
+       ulong rank=0;
+       bool search = true;
+       //find the corresponding block:
+       while (search) 
+       {
+               
+               lsP = x->getLeft()->subTreePositions;
+               lsR = x->getLeft()->subTreeRank;
+
+               if (lsP >= i)
+               {// cout << "L" << endl;
+                       x=x->getLeft();
+               }
+               else if (lsP+x->myPositions >= i){
+                       i-=lsP;
+                       rank+=lsR;
+                       search = false;
+               }
+               else{//cout << "R" << endl;
+                       i-=(lsP+x->myPositions);
+                       rank+=(lsR+x->myRank);
+                       x=x->getRight();
+               }
+       }
+
+       rank+=getLocalRank(x, i);
+       return rank;
+}
+
+ulong BVTree::rank0(ulong i){
+       if (this->getPositions() == 0) return 0;
+       return (i-rank1(i));
+}
+
+ulong BVTree::select1(ulong i){
+       BVNode* x = getRoot();
+       ulong select=0;
+       #ifndef NDEBUG  
+       if (i > x->subTreeRank ) {
+               cerr << "error: select1: invalid position in bittree: " << i << endl;
+               exit(EXIT_FAILURE);
+       }
+       #endif
+
+       ulong lsP=0;
+       ulong lsR=0;
+       bool search = true;
+       //find the corresponding block:
+       while (search) 
+       {
+               // update subTree-counters
+               
+               
+               lsP = x->getLeft()->subTreePositions;
+               lsR = x->getLeft()->subTreeRank;
+
+               if (lsR >= i) {
+                       x=x->getLeft();
+               }
+               else if (lsR+x->myRank >= i) {
+                       i-=lsR;
+                       select+=lsP;
+                       search = false;
+               }
+               else {
+                       i-=(lsR+x->myRank);
+                       select+=(lsP+x->myPositions);
+                       x=x->getRight();
+               }
+       }
+       select+=getLocalSelect1(x, i);
+
+
+
+       return select;
+}
+
+ulong BVTree::select0(ulong i){
+       BVNode* x = getRoot();
+       ulong select=0;
+
+       #ifndef NDEBUG
+       if (i > (x->subTreePositions - x->subTreeRank) || i < 0) {
+               cerr << "error: select1: invalid position in bittree: " << i << endl;
+               exit(EXIT_FAILURE);
+       }       
+       #endif
+
+
+       ulong lsP=0;
+       ulong lsR=0;
+       ulong lmR=0;
+       ulong lmP=0;
+       bool search = true;
+       //find the corresponding block:
+       while (search) 
+       {
+               // update subTree-counters
+               
+               
+               lsP = x->getLeft()->subTreePositions;
+               lsR = x->getLeft()->subTreeRank;
+               lmR = x->getLeft()->myRank;
+               lmP = x->getLeft()->myPositions;
+
+               if (lsP-lsR >= i)
+               {
+                       x=x->getLeft();
+               }
+               else if ((lsP-lsR)+(x->myPositions-x->myRank) >= i){
+                       i-=lsP-lsR;
+                       select+=lsP;
+                       search = false;
+               }
+               else{
+                       i-=((lsP-lsR)+(x->myPositions-x->myRank));
+                       select+=(lsP+x->myPositions);
+                       x=x->getRight();
+               }
+       }
+
+       select+=getLocalSelect0(x, i);
+
+       return select;  
+}
+
+
+void BVTree::updateCountersOnPathToRoot(BVNode *walk){
+
+       while (walk != getNil()) {
+               updateCounters(walk);
+               walk=walk->getParent();
+       }
+}
+
+//deletes the BVNode and all its children, destroys reb-black-tree property!
+void BVTree::deleteNode(BVNode *n){
+       if (n==getNil() || n == 0) return;
+
+       if (n->getLeft() != getNil() && n->getLeft()) deleteNode(n->getLeft());
+       if (n->getRight() != getNil() && n->getRight()) deleteNode(n->getRight());
+       
+       delete n;
+}
+
+
+
+
+// TODO improve by returning bitvalue
+
+void BVTree::deleteBit(ulong i){
+       ulong old_i = i; 
+       BVNode* x = getRoot();
+       bool bit;
+       ulong rank=0;
+
+       #ifndef NDEBUG  
+       if (x == getNil()) {
+               cerr << "error: deleteBit, root is empty\n"; //shouldn't happen
+               exit(EXIT_FAILURE);
+       }
+
+       if (i > x->subTreePositions || i < 1) 
+       {
+               cerr << "error: A, position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen
+               exit(EXIT_FAILURE);
+       }
+       #endif  
+
+       ulong lsP=0;
+       ulong lsR=0;
+
+       bool search = true;
+       //find the corresponding block:
+       while (search) 
+       {
+               // update of pointers is not yet possible: call updateCountersOnPathToRoot
+
+               lsP = x->getLeft()->subTreePositions;
+               lsR = x->getLeft()->subTreeRank;
+               
+
+               if (lsP >= i)
+               {
+                       #ifndef NDEBUG
+                       if (x->getLeft()==getNil()) exit(EXIT_FAILURE);
+                       #endif
+                       x=x->getLeft();
+               }
+               else if (lsP+x->myPositions >= i){
+                       i-=lsP;
+                       rank+=lsR;
+                       search = false;
+               }
+               else{
+                       i-=(lsP+x->myPositions);
+                       rank+=(lsR+x->myRank); // for speedup!
+                       #ifndef NDEBUG
+                       if (x->getRight()==getNil()) exit(EXIT_FAILURE);
+                       #endif
+                       x=x->getRight();
+               }
+       }
+
+
+       
+        // now delete the bit from the block x:
+       bit =(*x->block)[i-1];
+
+       // store bit and rank information for speedup
+       lastBitDeleted=bit;
+       rank+=getLocalRank(x, i);
+       lastRank=(bit?rank:old_i-rank);
+       
+       #ifndef NDEBUG  
+       if (i > x->myPositions) {
+               cerr << "error: B, position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       bitset<2*logn> mask;
+       
+
+       if ( i > 1 ) 
+       {
+               mask.set();
+               mask>>=(2*logn - i + 1);
+               mask &= *(x->block);
+               (*(x->block)>>=i)<<=i-1;  // shift bits by one  
+               (*x->block)|=mask;        // restore bits in front of deleted bit
+       } else {
+               *(x->block)>>=1;  // shift bits by one  
+       }
+               
+       x->myPositions--;
+       if (bit) x->myRank--;
+
+       updateCountersOnPathToRoot(x);
+
+       
+       if (x->myPositions == 0){ // if merging was not possible:
+       
+               rbDelete(x, callUpdateCountersOnPathToRoot);
+               delete x;
+
+       } else if (x->myPositions <= logn/2) // merge (and rotate), if necessary:
+       {
+               
+               BVNode *sibling = (BVNode*) treeSuccessor(x);
+               //cout << "try to merge -----------------------------------------" << endl;
+               if (sibling != getNil())
+               {
+                       if (x->myPositions + sibling->myPositions < 2*logn) //merge !
+                       {
+                       //cout << "merge with right sibling -----------------------------------------" << endl;
+                               // move bits from sibling to x:
+
+                               (*sibling->block)<<=x->myPositions;
+                               (*x->block)|=(*sibling->block);
+
+                               x->myPositions+=sibling->myPositions;
+                               x->myRank+=sibling->myRank;
+                               updateCountersOnPathToRoot(x);
+                               rbDelete(sibling, callUpdateCountersOnPathToRoot);
+                               delete sibling;
+                       }
+                       else sibling = getNil(); //to try the left sibling!
+               }
+               else if ((sibling= (BVNode*) treePredecessor(x)) != getNil())
+               {
+                       if (x->myPositions + sibling->myPositions < 2*logn) //merge !
+                       {
+                               // move bits from sibling to x: (more complicated, needs shift!)
+                               //cout << "merge with left sibling -----------------------------------------" << endl;
+                               (*x->block)<<=sibling->myPositions;
+                               x->myPositions+=sibling->myPositions;
+
+                               (*x->block)|=(*sibling->block);
+
+                               x->myRank+=sibling->myRank;
+                               updateCountersOnPathToRoot(x);
+                               rbDelete(sibling, callUpdateCountersOnPathToRoot);
+                               delete sibling;
+                       }
+                       
+
+               
+               } // end else if
+       } // end else if
+
+
+
+}
+
+void BVTree::writeTree(char *writefile){
+       std::ofstream write(writefile);
+       if (!write)
+       {
+               cerr << "error: Error writing file: " << writefile<< "." << endl;
+               exit(EXIT_FAILURE);
+       }
+       writeTree(write);
+}
+
+
+ulong* BVTree::getBits(){
+       BVNode *n = getRoot();
+       ulong blockCounter=0;
+       ulong len = getPositions();
+       
+       int W = sizeof(ulong)*8;
+       
+
+       ulong bitsLength = len/W + ((len%W==0)?0:1);
+
+       ulong *bits = new ulong[bitsLength];
+       
+       ulong i=0; //0 .. bitsLength-1
+       int j=0; //0 .. W-1
+       
+       ulong mask_LeftBitSet = 1;
+       
+       mask_LeftBitSet <<= W-1;
+               
+       n=(BVNode*) treeMinimum(root);
+       while (n != getNil()) {
+               #ifndef NDEBUG
+               if (n->block == 0) {
+                       cerr << "getBits(): block not found !!!" << endl;
+                       exit(EXIT_FAILURE);
+               }
+               #endif
+               for (blockCounter=0; blockCounter < n->myPositions; blockCounter++) {
+                       if (j == W) {
+                               i++;
+                               j=0;
+                       }
+                       bits[i] >>= 1; // set 0
+
+                       if ((*n->block)[blockCounter]) bits[i] |= mask_LeftBitSet; // set 1
+                       j++;    
+               }
+               n=(BVNode*) treeSuccessor(n);
+       }
+       
+       if (i != bitsLength-1) {
+               cout << "last index is wrong" << endl;
+               exit(EXIT_FAILURE);
+       }
+       while (j < W) {
+               bits[i] >>= 1;
+               j++;
+       }
+       
+       return bits;    
+}
+
+void BVTree::writeTree(ostream& stream){
+       BVNode *x;
+       char c=0;
+       ulong cCounter=0;
+       ulong blockCounter=0;
+       
+       x = (BVNode*) treeMinimum(getRoot());
+
+       while (x != getNil()){
+               blockCounter=x->myPositions;
+               while (blockCounter > 0)
+               {
+                       while (blockCounter > 0 && cCounter < 8 )
+                       {
+                               c<<=1;
+                               if ((*x->block)[x->myPositions - blockCounter]) c++;
+                               cCounter++;
+                               blockCounter--;
+                       }
+                       if (cCounter == 8) 
+                       {
+                               stream << c;
+                               cCounter = 0;
+                       }
+               }
+
+
+               x = (BVNode*) treeSuccessor(x);
+       }
+
+       if (cCounter != 0) {
+               while (cCounter < 8 )
+               {
+                       c<<=1; // fill with zeros.
+                       cCounter++;
+               }
+               stream << c;
+       }
+}
+
+
+void BVTree::appendBit(bool bit){
+       ulong pos = 1;
+       if (root != getNil()) pos= getRoot()->subTreePositions + 1;
+       
+       insertBit(bit, pos);
+}
+
+void BVTree::insertBit(bool bit, ulong i){
+       if (getRoot() == getNil())
+       {
+               BVNode *newNode;
+               newNode = new BVNode(getNil());
+               newNode->color=BLACK;
+               bitset<2*logn> *newBlock;
+               newBlock=new bitset<2*logn>;
+               newNode->block = newBlock;
+               setRoot(newNode);
+               
+       }
+
+       BVNode* x = getRoot();
+       
+       #ifndef NDEBUG
+       if (i > x->subTreePositions+1) 
+       {
+               printNode(x);
+               cerr << "error: insertBit: position " << i <<" in block not available, only " << x->myPositions <<" positions.\n"; //shouldn't happen
+               exit(EXIT_FAILURE);
+       }
+       if (i < 1) 
+       {
+               cerr << "error: insertBit: position " << i <<" is not valid.\n"; //shouldn't happen
+               exit(EXIT_FAILURE);
+       }
+       #endif
+
+       ulong lsP=0;
+       
+       
+       while (true) 
+       {
+               // update subTree-counters
+               (x->subTreePositions)++;
+               if (bit) (x->subTreeRank)++;
+               
+               lsP = x->getLeft()->subTreePositions;
+               
+               if (lsP >= i) 
+               {//     cout << "A" << endl;
+                       x=x->getLeft();
+               }
+               else if (lsP+x->myPositions >= i-1){ //-1 to append to last position in current node
+                       i-=lsP;
+                       break;
+               }
+               else{
+                       i-=(lsP+x->myPositions);
+                       x=x->getRight();
+               }
+       }
+
+       // now put the bit into the block x:
+       bitset<2*logn> mask;
+       
+       if (x->myPositions > 0) 
+       {
+               if (i != 1) 
+               {
+                       mask.set();
+                       mask>>=2*logn - i +1; // make 1's at all positions < i
+                       mask &= *(x->block);      // store bits in front of new bit     
+                       (*(x->block)>>=i-1)<<=i;  // shift bits by one
+                       (*x->block)|=mask;        // restore bits in front of new bit
+               }
+               else 
+                 (*x->block)<<=1;
+       }
+       
+       (*x->block)[i-1]=bit;     // set new bit
+
+       // update local pointers
+       (x->myPositions)++;      //update p (leaf)
+       if (bit) (x->myRank)++;  //update r (leaf)
+
+       #ifndef NDEBUG  
+       if ((int)x->myPositions > 2*logn) 
+       {
+               cerr << "error: positions in block already too many.\n"; //shouldn't happen
+               exit(EXIT_FAILURE);
+       }
+       #endif
+
+       // split and rotate, if necessary
+       if ((int)x->myPositions == 2*logn)
+       {
+               //cout << "split !-------------" << endl;
+
+               // new node:            
+               BVNode *newNode;
+               newNode = new BVNode(getNil());
+
+               //find place for new node:
+               BVNode *y;// some parent of new node
+               if (x->getRight() != getNil()) {
+                       y = (BVNode*) this->treeMinimum(x->getRight());
+                       y->setLeft(newNode);
+               } else {
+                       y=x;
+                       y->setRight(newNode);
+               }
+               newNode->setParent(y);
+               
+               //new block:
+               bitset<2*logn> *newBlock;
+               newBlock=new bitset<2*logn>;
+               *newBlock=*x->block>>logn; //copy of bits into the new block
+               
+               mask.set();
+               mask>>=logn;
+               *x->block &= mask; //delete bits that already have been copied to the new block         
+               
+               newNode->block=newBlock;
+               newNode->myRank=(*newNode->block).count();
+               newNode->myPositions=logn;
+               newNode->color=RED;
+
+               //update old node x:
+               x->myRank-=newNode->myRank;     //=(*x->block).count();
+               x->myPositions=logn;
+
+               updateCountersOnPathToRoot(newNode);  // meets x
+               rbInsertFixup(newNode, callUpdateCounters);
+               
+       } // end if
+       
+
+}
+
+void BVTree::checkSubTree(BVNode *n){
+       ulong lP = 0;
+       ulong lR = 0;
+       ulong rP = 0;
+       ulong rR = 0;
+
+       if (n->getLeft()!=getNil()) {
+               lP = n->getLeft()->subTreePositions;
+               lR = n->getLeft()->subTreeRank;
+               if (n->getLeft()->getParent() != n) {
+                       cout << "au"<< endl;
+                       exit(1);
+               }
+               
+       }
+
+       if (n->getRight()!=getNil()) {
+               rP = n->getRight()->subTreePositions;
+               rR = n->getRight()->subTreeRank;
+               if (n->getRight()->getParent() != n) {
+                       cout << "au"<< endl;
+                       exit(1);
+               }               
+       }
+
+       if  ( (n->subTreePositions != (n->myPositions + lP + rP)) ||
+             (n->subTreeRank != (n->myRank + lR + rR)) ){
+               cout << "checkSubTree: error" <<endl;
+               cout << "lP: " << lP << endl;
+               cout << "lR: " << lR << endl;
+               cout << "rP: " << rP << endl;
+               cout << "rR: " << rR << endl;
+               cout << "n->myPositions + lP + rP: " << n->myPositions + lP + rP << endl;
+               cout << "n->myRank + lR + rR: " << n->myRank + lR + rR << endl;
+               printNode(n);
+               printNode(n->getLeft());
+               printNode(n->getRight());
+               exit(1);
+       }       
+
+       if (n->getLeft()!=getNil()) checkSubTree(n->getLeft());
+       if (n->getRight()!=getNil()) checkSubTree(n->getRight());
+}
+
+void callUpdateCounters(RBNode *n, RBTree *T){
+       ((BVTree*)T)->updateCounters((BVNode*) n);
+}
+
+void callUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){
+       ((BVTree*)T)->updateCountersOnPathToRoot((BVNode*) n);
+}
+
diff --git a/bittree.h b/bittree.h
new file mode 100644 (file)
index 0000000..5344f74
--- /dev/null
+++ b/bittree.h
@@ -0,0 +1,227 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *   No object matches key 'wgerlach'.   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+// Implementation of the Dynamic Bit Vector with Indels problem
+// space: O(n) time: O(log(n))
+// papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text
+//           Indexes. CPM 2006, Chapter 3.4 Dynamic Structures for Bit Vectors
+//   also: W.-L. Chan, W.-K. Hon, and T.-W. Lam. Compressed index for a dynamic collection
+//           of texts. In Proc. CPM04, LNCS 3109, pages 445-456, 2004 
+
+#ifndef BVTree
+#define BVTree BVTree
+
+#include <iostream>
+#include <bitset>
+#include <cstdlib>
+#include <map>
+#include <stack>
+#include <cmath>
+#include <fstream>
+#include <vector>
+#include <cstdio>
+
+#include "rbtree.h"
+
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+
+#ifndef LOGN
+#define LOGN 64
+#endif
+
+const int logn = LOGN;
+
+//upperBound = 2 * logn;
+//lowerBound = logn / 2;
+
+
+class BVNode;
+class BVTree;
+
+void callUpdateCounters(RBNode *n, RBTree *T);
+void callUpdateCountersOnPathToRoot(RBNode *n, RBTree *T);
+
+class BVNode : public RBNode
+{
+       public:
+       ulong myPositions; // 4*4 bytes = 16 bytes
+       ulong myRank;
+       ulong subTreePositions; //number of positions stored in the subtree rooted at this node
+       ulong subTreeRank;      //number of bits set in the subtree rooted at this node
+
+        std::bitset<2*logn> *block; // 4 bytes
+       
+
+       BVNode()
+               : RBNode(this), myPositions(0), myRank(0), subTreePositions(0), subTreeRank(0), block(0) {
+       }
+
+
+       BVNode(BVNode* n)
+               : RBNode(n), myPositions(0), myRank(0), subTreePositions(0), subTreeRank(0), block(0) {
+       }
+
+       ~BVNode(){
+               delete block;
+                block = 0;
+       }
+
+               
+       BVNode* getParent(){
+               return ((BVNode*) ((RBNode*) this)->parent);
+       }
+
+       BVNode* getLeft(){
+               return ((BVNode*) ((RBNode*) this)->left);
+       }
+
+       BVNode* getRight(){
+               return ((BVNode*) ((RBNode*) this)->right);
+       }
+
+       void setParent(BVNode* n){
+               ((RBNode*) this)->parent=(RBNode*)n;
+       }
+
+       void setLeft(BVNode* n){
+               ((RBNode*) this)->left=(RBNode*)n;
+       }
+
+       void setRight(BVNode* n){
+               ((RBNode*) this)->right=(RBNode*)n;
+       }
+               
+};
+
+
+class BVTree : public RBTree{
+public:
+
+  //Constructors
+  BVTree(){
+       
+       setNil(new BVNode());
+       setRoot(getNil());
+
+       tempnil = getNil();
+
+       tempbit = new std::bitset<2*logn>;
+  }
+
+  //Destructor:
+  ~BVTree();
+
+  bool operator[](ulong);
+
+
+  // inserts bit at position i, countings begins with 1:
+  void appendBit(bool bit);
+  void insertBit(bool bit, ulong i);
+  void deleteBit(ulong i);
+
+  ulong rank0(ulong i);
+  ulong rank1(ulong i);
+  ulong rank(bool b, ulong i){return b?rank1(i):rank0(i);}
+  
+  ulong select0(ulong i);
+  ulong select1(ulong i);
+  ulong select(bool b, ulong i){return b?select1(i):select0(i);}
+
+  void setRoot(BVNode* n){
+       ((RBTree*) this)->root=(RBNode*)n;
+  }
+       
+  BVNode* getRoot(){
+       return ((BVNode*) ((RBTree*) this)->root);
+  }
+
+  void setNil(BVNode* n){
+       tempnil = n;
+       ((RBTree*) this)->nil=(RBNode*)n;
+  }
+
+  BVNode* getNil(){
+       return tempnil;
+       
+  }
+
+  // write bits back into a stream:  
+  ulong* getBits();
+  void writeTree(char *writefile); 
+  void writeTree(std::ostream& stream); //e.g. stream = cout
+
+  int getTreeMaxDepth();
+  int getTreeMinDepth();
+  ulong getPositions();
+  ulong getRank();
+  
+  void iterateReset();
+  bool iterateGetBit();
+  bool iterateNext();
+  ulong iterateGetRank(bool bit);
+  
+  bool getLastBitDeleted(){return lastBitDeleted;}
+  ulong getLastRank(){return lastRank;}
+  
+  void checkSubTree(BVNode *n);
+
+  void updateCounters(BVNode *n);
+  void updateCountersOnPathToRoot(BVNode *walk);
+  
+  //debug:
+  void printNode(ulong i);
+
+protected:
+
+  ulong iterate;
+  ulong iterateLocal;
+  ulong iterateRank;
+  BVNode *iterateNode;
+
+  BVNode *tempnil;
+
+  bool lastBitDeleted;
+  ulong lastRank;
+  
+
+  std::bitset<2*logn> *tempbit;
+
+  // content of BVNode, for debugging:
+  void printNode(BVNode *n);
+  
+  // other operations:
+  ulong getLocalRank(BVNode* n, ulong position);
+  ulong getLocalSelect0(BVNode* n, ulong query);
+  ulong getLocalSelect1(BVNode* n, ulong query);
+  
+  void deleteNode(BVNode *n);
+  void deleteLeaf(BVNode *leaf);
+};
+
+
+#endif
+
diff --git a/dependencies.mk b/dependencies.mk
new file mode 100644 (file)
index 0000000..4e40bc8
--- /dev/null
@@ -0,0 +1,12 @@
+BitRank.o: BitRank.cpp BitRank.h Tools.h
+CSA.o: CSA.cpp CSA.h BitRank.h Tools.h dynFMI.h bittree.h rbtree.h \
+  handle.h pos.h
+TextCollection.o: TextCollection.cpp TextCollection.h Tools.h
+Tools.o: Tools.cpp Tools.h
+bittree.o: bittree.cpp bittree.h rbtree.h
+dynFMI.o: dynFMI.cpp dynFMI.h bittree.h rbtree.h handle.h pos.h
+handle.o: handle.cpp handle.h rbtree.h pos.h
+pos.o: pos.cpp pos.h rbtree.h handle.h
+rbtree.o: rbtree.cpp rbtree.h
+testTextCollection.o: testTextCollection.cpp CSA.h BitRank.h Tools.h \
+  dynFMI.h bittree.h rbtree.h handle.h pos.h TextCollection.h
diff --git a/dynFMI.cpp b/dynFMI.cpp
new file mode 100644 (file)
index 0000000..dd4e989
--- /dev/null
@@ -0,0 +1,807 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *      *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+
+#include <iostream>
+#include <cstdlib>
+#include <fstream>
+#include <stack>
+#include <queue>
+#include <functional>
+#include <algorithm>
+
+#include "dynFMI.h"
+
+using std::cerr;
+using std::cout;
+using std::endl;
+using std::ios;
+using std::priority_queue;
+using std::vector;
+
+
+
+// -------- DynFMI --------
+
+DynFMI::~DynFMI(){
+       deleteDynFMINodes(root);
+        root = 0;
+       delete[] leaves; //free(leaves) // ???;
+#if SAMPLE!=0
+       delete SampledSAPositionsIndicator;
+#endif
+       delete handle;
+       delete pos;
+#if SAMPLE!=0
+       delete sampledSATree;
+#endif
+}
+
+void DynFMI::deleteDynFMINodes(WaveletNode *n){
+       if (n->right) deleteDynFMINodes(n->right);
+       if (n->left) deleteDynFMINodes(n->left);
+       delete n;
+}
+
+DynFMI::DynFMI(uchar *text, ulong x, ulong n, bool del){
+       initEmptyDynFMI(text, x, n);
+       if (del) delete [] text;
+}
+
+DynFMI::DynFMI(char *readfile){
+
+       std::ifstream file(readfile);
+       if (!file)
+       {
+               cerr << "error reading file: " << readfile << endl;
+               exit(EXIT_FAILURE);
+       }
+       
+       file.seekg(0, ios::end);
+       ulong n = file.tellg();
+       file.seekg(0, ios::beg);
+       
+       uchar *text = new uchar[n];
+       
+       char c;
+       ulong i=0;
+       
+       
+       while (file.get(c))
+       {
+               text[i]=c;
+               i++;
+       }
+       file.close();
+       
+       initEmptyDynFMI(text, 1, n);
+       
+       
+       delete text;
+       
+}
+
+void DynFMI::iterateReset(){
+       iterate = 1;
+       recursiveIterateResetWaveletNode(root);
+}
+
+void DynFMI::recursiveIterateResetWaveletNode(WaveletNode *w){
+       w->bittree->iterateReset();
+       
+       if (w->left) recursiveIterateResetWaveletNode(w->left);
+       if (w->right) recursiveIterateResetWaveletNode(w->right);
+}
+
+
+bool DynFMI::iterateNext(){
+       iterate++;
+       return !(iterate > getSize());
+}
+
+uchar DynFMI::iterateGetSymbol(){
+
+       ulong i = iterate;
+       WaveletNode *walk = root;       
+       bool bit;
+               
+       while (true) {
+               
+               bit = walk->bittree->iterateGetBit(); // TODO improve
+               i=walk->bittree->iterateGetRank(bit);
+               
+               walk->bittree->iterateNext();
+               
+               
+               if (bit) { //bit = 1
+                       if (walk->right == 0) return walk->c1;
+                       walk=walk->right;
+               } else { // bit = 0
+                       if (walk->left == 0) return walk->c0;
+                       walk=walk->left;
+               }
+               
+               
+       } // end of while
+       
+}
+
+
+uchar* DynFMI::getBWT(){
+       ulong n = root->bittree->getPositions();
+       n++;
+       uchar *text = new uchar[n];
+       
+       bool data=true;
+       // old slow version:
+       //for (ulong i=1; i <= root->bittree->getPositions(); i++)
+       //      text[i-1]=(*this)[i];
+       
+       ulong i = 0;
+       
+       iterateReset();
+       
+       while (data) {
+               text[i] = iterateGetSymbol();   
+               data = iterateNext();
+               i++;
+       }
+       text[n-1]=0;
+       
+       return text;
+}
+
+std::ostream& DynFMI::getBWTStream(std::ostream& stream){
+       ulong n = root->bittree->getPositions();
+       n++;
+       //uchar *text = new uchar[n];
+       
+       bool data=true;
+       // old slow version:
+       //for (ulong i=1; i <= root->bittree->getPositions(); i++)
+       //      text[i-1]=(*this)[i];
+       
+       ulong i = 0;
+       
+       iterateReset();
+       
+       while (data) {
+               //text[i] = iterateGetSymbol(); 
+               stream << iterateGetSymbol();
+               data = iterateNext();
+               i++;
+       }
+       //text[n-1]=0;
+
+       return stream;
+}
+
+//void DynFMI::printDynFMIContent(ostream& stream){
+//     uchar c;
+//     for (ulong i=1; i<=getSize(); i++) 
+//     {
+//             c =(*this)[i];
+//             if (c==0) c= '#';
+//             stream << c;
+//     }
+
+
+void DynFMI::deleteLeaves(WaveletNode *node){
+       bool leaf = true;
+
+       if (node->left) {
+               // internal node
+               leaf = false;
+               deleteLeaves(node->left);
+               
+       }
+       if (node->right){
+               leaf = false;
+               deleteLeaves(node->right);
+       } 
+       
+       if (leaf) {
+               // is a leaf, delete it!
+               if (node->parent) {
+                       if (node==node->parent->left) node->parent->left=0;
+                               else node->parent->right=0;
+               }
+               delete node;
+       }
+}
+
+void DynFMI::makeCodes(ulong code, int bits, WaveletNode *node){
+       #ifndef NDEBUG
+       if (node == node->left) {
+               cout << "makeCodes: autsch" << endl;
+               exit(0);
+               }
+       #endif
+
+       if (node->left) {
+               makeCodes(code | (0 << bits), bits+1, node->left);
+               makeCodes(code | (1 << bits), bits+1, node->right);
+       } else {
+               codes[node->c0] = code;
+               codelengths[node->c0] = bits+1;
+       }
+}
+
+void DynFMI::appendBVTrees(WaveletNode *node){
+       node->bittree = new BVTree();
+
+       if (node->left) appendBVTrees(node->left);
+       if (node->right) appendBVTrees(node->right);
+}
+
+void DynFMI::initEmptyDynFMI(uchar *text, ulong x, ulong n){
+       // pointers to the leaves for select
+       leaves = (WaveletNode**) new WaveletNode*[256];
+       for(int j=0; j<256; j++) leaves[j]=0;
+
+
+       ulong i;
+       for(i=0; i < n; i++) {
+               if (text[i]==0) {
+                       cerr << "Error: unsigned char 0 is reserved." << endl;
+                       exit(EXIT_FAILURE);
+               }
+               
+               if (leaves[text[i]]==0) {
+                       leaves[text[i]] = new WaveletNode(text[i]); 
+               }
+               leaves[text[i]]->weight++; 
+       }
+       
+       // separation symbol:
+       leaves[0] = new WaveletNode((uchar)0); 
+       leaves[0]->weight=x;
+       
+       // Veli's approach:
+       priority_queue< WaveletNode*, vector<WaveletNode*>, std::greater<WaveletNode*> > q;
+       
+       
+       for(int j=255; j>=0; j--){
+               if (leaves[j]!=0) {
+                       q.push( (leaves[j]) );
+               }
+               codes[j] = 0;
+               codelengths[j] = 0;
+       }
+       
+       // creates huffman shape:
+       while (q.size() > 1) {
+               
+               WaveletNode *left = q.top();
+               q.pop();
+               
+               WaveletNode *right = q.top();
+               q.pop();
+               
+               q.push(  new WaveletNode(left, right) );
+       }       
+       
+
+       root = q.top();
+       q.pop();
+       
+                       
+       makeCodes(0,0, root);   // writes codes and codelengths
+
+       
+       // merge leaves (one leaf represent two characters!)
+       for(int j=0; j<256; j++){
+       
+               if (leaves[j]) {
+               
+                       if (leaves[j]->parent->left==leaves[j]) {
+                               leaves[j]->parent->c0=j;
+                       } else {
+                               leaves[j]->parent->c1=j;
+                       }
+                       leaves[j]=leaves[j]->parent; // merge
+               }
+       }
+       
+       deleteLeaves(root);
+       appendBVTrees(root);
+       
+       // array C needed for backwards search
+       for(int j=0; j<256+256; j++) C[j] = 0;
+       
+#if SAMPLE!=0
+       this->SampledSAPositionsIndicator = new BVTree();
+       this->sampledSATree = new SampledSATree();
+#endif
+       
+       this->pos = new Pos(sampleInterval);
+       this->handle = new Handle();
+       
+       pos->handle=this->handle;
+       handle->pos=this->pos;
+       
+}
+
+void DynFMI::append(uchar *text, ulong length){
+       ulong i;
+       ulong start = root->bittree->getPositions();
+       for (i = start; i<start+length; i++)
+               insert(text[i-start], i+1);
+}
+
+
+void DynFMI::append(uchar c){
+       insert(c, root->bittree->getPositions()+1);
+}
+
+void DynFMI::insert(uchar c, ulong i){
+       #ifndef NDEBUG
+       if (codelengths[c]==0) {
+               cerr << "error: Symbol \"" << c << "\" (" << (int)c << ") is not in the code table!" << endl;;
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       
+       ulong level = 0;
+       ulong code = codes[c];
+
+       bool bit;
+       WaveletNode *walk = root;       
+               
+       while (walk) {
+               
+               bit = ((code & (1u << level)) != 0); 
+               
+               walk->bittree->insertBit(bit,i); // TODO improve
+               i=walk->bittree->rank(bit, i);
+
+               if (bit) { //bit = 1
+                       walk=walk->right;
+               } else { // bit = 0
+                       walk=walk->left;
+               }
+               
+               level++;                
+       } // end of while
+       
+       int j = 256+c;
+       while(j>1) {
+               C[j]++;
+               j=binaryTree_parent(j);
+               }
+       C[j]++; 
+       
+}
+
+void DynFMI::deleteSymbol(ulong i){
+       WaveletNode *walk = root;       
+       bool bit;
+       uchar c;
+       while (true) {
+
+               // original slow version:
+               //bit = (*walk->bittree)[i];
+               //old_i = i;            
+               //i=walk->bittree->rank(bit, i);        
+               //walk->bittree->deleteBit(old_i);
+
+
+               walk->bittree->deleteBit(i);
+               bit=walk->bittree->getLastBitDeleted();
+               i=walk->bittree->getLastRank();
+               
+               if (bit) { //bit = 1
+                       if (walk->right == 0) {
+                               c = walk->c1;
+                               break;
+                               }
+                       walk=walk->right;
+               } else { // bit = 0
+                       if (walk->left == 0) {
+                               c = walk->c0;
+                               break;
+                       }
+                       walk=walk->left;
+               }
+               
+               
+       } // end of while
+       
+       int j = 256+c;
+       while(j>1) {
+               C[j]--;
+               j=binaryTree_parent(j);
+               }
+       C[j]--; 
+       
+       
+}
+
+
+uchar DynFMI::operator[](ulong i){
+       WaveletNode *walk = root;       
+       bool bit;
+               
+       while (true) {
+               
+               bit = (*walk->bittree)[i]; //TODO improve by reducing
+               i=walk->bittree->rank(bit, i);
+
+               if (bit) { //bit = 1
+                       if (walk->right == 0) return walk->c1;
+                       walk=walk->right;
+               } else { // bit = 0
+                       if (walk->left == 0) return walk->c0;
+                       walk=walk->left;
+               }
+               
+               
+       } // end of while
+       cout << endl;
+}
+
+ulong DynFMI::rank(uchar c, ulong i){
+       if (i==0) return 0;
+
+       ulong level = 0;
+       ulong code = codes[c];
+       
+       
+       bool bit;
+       WaveletNode *walk = root;       
+               
+       while (true) {
+               
+               bit = ((code & (1u << level)) != 0);
+               
+               i=walk->bittree->rank(bit, i);
+               if (bit) { //bit = 1
+                       if (walk->right == 0) return i;
+                       walk=walk->right;
+               } else { // bit = 0
+                       if (walk->left == 0) return i;
+                       walk=walk->left;
+               }
+       
+               level++;                
+       } // end of while
+       
+       cerr << "error: DynFMI::rank: left while loop" << endl;
+       exit(EXIT_FAILURE);
+       return 0; //never
+}
+
+ulong DynFMI::select(uchar c, ulong i){
+       
+       WaveletNode *walk = leaves[c];  
+       
+       bool bit = (walk->c1==c);
+       
+       while (walk->parent) {
+               i=walk->bittree->select(bit, i);
+               
+               bit = (walk == walk->parent->right);
+               walk=walk->parent;
+       } // end of while
+       
+       i=walk->bittree->select(bit, i);
+
+       return i;
+}
+
+
+ulong DynFMI::count(uchar *pattern){
+
+       ulong i = strlen((char*)pattern);
+       ulong sp = 1;
+
+       ulong ep = getSize();
+       uchar s;
+       while ((sp <= ep) && (i >=1)) {
+               s=pattern[i-1];
+               sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
+               ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
+               
+               i--;
+       }
+       return ep-sp +1;
+}
+
+#if SAMPLE!=0
+ulong* DynFMI::locate(uchar *pattern){
+
+       ulong i = strlen((char*)pattern);
+       ulong sp = 1;
+
+       ulong ep = getSize();
+       uchar s;
+       while ((sp <= ep) && (i >=1)) {
+               s=pattern[i-1];
+               sp=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, sp-1) + 1;
+               ep=(ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,ep);
+               
+               i--;
+       }
+
+       ulong numberOfMatches = ep-sp +1;
+       ulong *matches = new ulong[2*numberOfMatches + 1];
+       matches[0] = numberOfMatches;
+
+       ulong diff;
+       ulong k = 1;
+       cout << "----------------------" << endl;       
+       for (ulong j=sp; j <= ep; j++) {
+
+               i=j;
+               diff = 0;
+               while (!(*SampledSAPositionsIndicator)[i]) {
+                       diff++;
+                       s=(*this)[i];
+                       i= (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s, i);
+       
+               }
+       
+
+               sampledSATree->getSample(SampledSAPositionsIndicator->rank(true, i));
+
+               //cout << "found: (" << sampledSATree->handle  << ", " << sampledSATree->sampledSAValue + diff << ")"<< endl;
+               matches[k++] = sampledSATree->handle;
+               matches[k++] = sampledSATree->sampledSAValue + diff;
+       }
+
+       return matches;
+}
+#endif
+
+// size must include endmarker!
+ulong DynFMI::addText(uchar const * str, ulong n){
+       ulong i;
+#if SAMPLE!=0
+       bool sample = false;
+#endif
+       i=pos->getSize()+1;
+       
+       ulong key = pos->appendText(n);
+
+        // Adding an empty text (end-marker)
+        if (n == 1)
+        {
+            insert(str[0], i);
+            //if (del) delete [] str;
+            return key;
+        }
+
+       insert(str[n-2],i); // insert second last character, corresponds to suffix of length 1
+#if SAMPLE!=0
+       sample = (n-1 % sampleInterval == 0);
+       SampledSAPositionsIndicator->insertBit(sample, i);      
+       if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+       for (ulong t=n-2; t > 0; t--) {
+               i= 1+getNumberOfSymbolsSmallerThan(str[t]) + rank(str[t],i);
+               insert(str[t-1],i);
+#if SAMPLE!=0
+               sample = ((t) % sampleInterval == 0);
+               SampledSAPositionsIndicator->insertBit(sample, i);
+
+               if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+       }
+       
+
+       i= 1+ getNumberOfSymbolsSmallerThan(str[0]) + rank(str[0],i);
+       insert(str[n-1],i);
+#if SAMPLE!=0
+       SampledSAPositionsIndicator->insertBit(true, i); 
+       sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
+#endif 
+//     if (del)  delete [] str;
+       
+       return key;
+}
+
+
+ulong DynFMI::addTextFromFile(char* filename, ulong n){
+       ulong i;
+#if SAMPLE!=0
+       bool sample = false;
+#endif
+       i=pos->getSize()+1;
+
+
+       std::ifstream str(filename, ios::binary);
+       if (!str)
+       {
+               cerr << "error reading file: " << filename << endl;
+               exit(EXIT_FAILURE);
+       }
+
+       char c;
+       //char oldc;
+
+
+       ulong buf_len; //1048576; //10MB
+       char * buffer;
+       ulong buf_pos;
+
+       if (BUFFER==0) buf_len = n;
+               else buf_len = BUFFER;
+
+       buffer = new char[buf_len];
+       buf_pos = (n-2) / buf_len;
+       str.seekg(buf_pos*buf_len);
+       str.read(buffer, buf_len);
+       str.clear(std::ios_base::goodbit);
+
+
+       ulong key = pos->appendText(n);
+       
+       //str.seekg(n-2);
+
+       //c=str.get();
+       c=buffer[(n-2)%buf_len];
+       insert(c,i); // insert second last character, corresponds to suffix of length 1
+#if SAMPLE!=0
+       sample = (n-1 % sampleInterval == 0);
+       SampledSAPositionsIndicator->insertBit(sample, i);      
+       if (sample) sampledSATree->insertSample(n-1, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+       if ((n-2)%buf_len == 0) {
+               c=buffer[0];
+               buf_pos--;
+               str.seekg(buf_pos*buf_len);
+               str.read(buffer, buf_len);
+       }
+       for (ulong t=n-2; t > 0; t--) {
+
+               i= 1+getNumberOfSymbolsSmallerThan(c) + rank(c,i);
+
+               c=buffer[(t-1)%buf_len];
+               insert(c,i);
+
+               //check buffer
+               if (((t-1)%buf_len == 0) && ((t-1)!=0)) {
+#ifndef NDEBUG
+                       if (buf_pos == 0) {
+                               cerr << "buf_pos too small" << endl;
+                               exit(0);
+                       }
+#endif
+                       buf_pos--;
+                       str.seekg(buf_pos*buf_len);
+                       str.read(buffer, buf_len);
+               }
+
+#if SAMPLE!=0
+               sample = ((t) % sampleInterval == 0);
+               SampledSAPositionsIndicator->insertBit(sample, i);
+
+               if (sample) sampledSATree->insertSample(t, key, SampledSAPositionsIndicator->rank(true,i));
+#endif
+       }
+
+       i= 1+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
+       insert(c,i);
+
+#if SAMPLE!=0
+       SampledSAPositionsIndicator->insertBit(true, i); 
+       sampledSATree->insertSample(1, key, SampledSAPositionsIndicator->rank(true,i));
+#endif 
+
+       str.close();
+
+       return key;
+}
+
+
+uchar* DynFMI::retrieveText(ulong key){
+       cout << "key: " << key << endl;
+       uchar *text=0;
+       // TODO access two times, too much
+       ulong i=handle->getPos(key); //=bwtEnd;
+       if (i == 0 ) {
+               return text;
+       }
+
+       ulong n=pos->getTextSize(i);
+       
+       text = new uchar[n]; // last byte 0 for cout
+       
+       uchar c;
+       
+//TODO better  
+       for (ulong t=n-2; t < n; t--) {
+               c = (*this)[i];
+               text[t]=(c==0?'#':c);
+               
+               i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i);
+       }
+
+       #ifndef NDEBUG          
+       for (ulong i=0; i< n-1 ; i++) if (text[i]==0) text[i]='-';
+       #endif  
+       
+       text[n-1] = 0;
+       return text;
+}
+
+
+bool DynFMI::deleteText(ulong key){
+       // TODO access two times, can do better
+       ulong i=handle->getPos(key); //=bwtEnd;
+       if (i == 0) return false;
+
+       ulong n=pos->getTextSize(i);
+       ulong *text = new ulong[n]; 
+       uchar c;
+       
+       for (ulong t=n-1; t <n; t--) {
+               c = (*this)[i];
+               text[t]=i;
+               i= 0+ getNumberOfSymbolsSmallerThan(c) + rank(c,i); // TODO improve by lastrank!
+       }
+
+
+        std::sort(text, text + n);
+
+
+       for (i=n-1; i<n; i--) {
+               this->deleteSymbol(text[i]); 
+#if SAMPLE!=0
+               SampledSAPositionsIndicator->deleteBit(text[i]);                
+//TODO samples ?
+#endif
+       }
+       delete[] text;
+       handle->deleteKey(key);
+       
+               
+       return true;
+}
+
+ulong DynFMI::getNumberOfSymbolsSmallerThan(uchar c){
+       int j = 256+c;
+       ulong r=0;
+       while(j>1) {
+               if (binaryTree_isRightChild(j)) 
+                       r += C[binaryTree_left(binaryTree_parent(j))];
+               
+               j=binaryTree_parent(j);
+       }
+       return r;
+}
+
+
+
+
+void DynFMI::printDynFMIContent(std::ostream& stream){
+       uchar c;
+       for (ulong i=1; i<=getSize(); i++) 
+       {
+               c =(*this)[i];
+               if (c==0) c= '#';
+               stream << c;
+       }
+}
+
+
+
+
diff --git a/dynFMI.h b/dynFMI.h
new file mode 100644 (file)
index 0000000..41ef992
--- /dev/null
+++ b/dynFMI.h
@@ -0,0 +1,233 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *      *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+// ------ Dynamic Sequence with Indels -----
+// uses huffman shaped wavelet tree
+// space: O(nH(0)) operation time: O(log n log \sigma)
+// papers: V. Maekinen, G. Navarro. Dynamic Entropy-Compressed Sequences and Full-Text
+//           Indexes. CPM 2006, Chapter 3.6 
+
+
+#include <iostream>
+#include <cstdlib>
+#include <fstream>
+#include <math.h>
+#include <bitset>
+
+#ifndef SAMPLE
+//#define SAMPLE 1024
+#define SAMPLE 0 // No sampling.
+#endif
+
+#ifndef BUFFER
+#define BUFFER 1048576
+#endif
+
+
+#include "bittree.h"
+#include "handle.h"
+#include "pos.h"
+
+#if SAMPLE!=0
+#include "ssatree.h"
+#endif
+
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+
+
+const ulong sampleInterval = SAMPLE;
+
+class WaveletNode{
+       public:
+       
+       WaveletNode *left;
+       WaveletNode *right;
+       WaveletNode *parent;
+       ulong weight; // used only while construction
+       uchar c0;      // used also while construction
+       uchar c1;
+       
+       BVTree *bittree;
+       
+       WaveletNode(uchar c)
+               : left(0), right(0), parent(0), weight(0), c0(c), bittree(0){}
+
+       WaveletNode(WaveletNode *left, WaveletNode *right)
+               : left(left), right(right), parent(0), bittree(0) {
+               weight = left->weight + right->weight;
+               left->parent = this;
+               right->parent = this;
+       }
+       
+       ~WaveletNode(){
+               delete bittree;
+                bittree = 0;
+       }
+       
+       bool operator>(const WaveletNode &a) const {
+               return (weight > a.weight);
+       }
+       
+       
+};
+
+namespace std
+{
+
+template<> struct greater<WaveletNode*>
+  {
+    bool operator()(WaveletNode const* p1, WaveletNode const* p2)
+    {
+      if(!p1)
+        return false;
+      if(!p2)
+        return true;
+      return *p1 > *p2;
+    }
+  };
+}
+  
+  
+class DynFMI{
+       public:
+               
+               // read char distribution from string
+               // x=expected number of texts, n=length of text, del=delete text afterwards
+               DynFMI(uchar *text, ulong x, ulong n, bool del);
+               // read char distribution from file
+               DynFMI(char *readfile);
+               
+               ~DynFMI();
+               
+                               
+               
+               //n=length of str, del=delete text afterwards
+               ulong addText(uchar const * str, ulong n);
+               ulong addTextFromFile(char* filename, ulong n);
+               uchar* retrieveText(ulong text);
+               bool deleteText(ulong key);
+               
+               ulong count(uchar *pattern);
+#if SAMPLE!=0
+               ulong* locate(uchar *pattern);
+#endif
+               
+               //LF(i)-mapping: C[L[i]]+rank_L[i](L,i)
+               ulong LFmapping(ulong i) {
+                       uchar s=(*this)[i];
+                       return (ulong)getNumberOfSymbolsSmallerThan(s) + rank(s,i);
+                       }
+               
+
+               ulong getSize() {return root->bittree->getPositions();}
+               ulong getCollectionSize(){ return pos->getSize();}
+               uchar* getBWT();
+                std::ostream& getBWTStream(std::ostream& stream);
+
+               uchar operator[](ulong i);
+               void printDynFMIContent(std::ostream& stream);
+               ulong* getKeys() {return handle->getKeys(); }   
+               ulong getPos(ulong i)
+                { return handle->getPos(i); }
+               
+       private:
+               WaveletNode *root;
+               WaveletNode **leaves; // needed for construction and select
+               
+               ulong codes[256];
+               int codelengths[256];
+               ulong C[256+256];
+               
+#if SAMPLE!=0
+               BVTree *SampledSAPositionsIndicator;
+#endif         
+               ulong iterate;
+                               
+               Handle *handle;
+               Pos *pos;
+#if SAMPLE!=0
+               SampledSATree *sampledSATree;
+#endif
+
+
+               ulong rank(uchar c, ulong i);
+               ulong select(uchar c, ulong i);
+       
+               void insert(uchar c, ulong i);
+               void append(uchar c);
+               void append(uchar *text, ulong length);
+               void deleteSymbol(ulong i);
+
+
+
+               // functions
+               ulong getNumberOfSymbolsSmallerThan(uchar c);
+               
+               void initEmptyDynFMI(uchar *text, ulong x, ulong n);
+               
+               void makeCodes(ulong code, int bits, WaveletNode *node);
+               void deleteLeaves(WaveletNode *node);
+               void appendBVTrees(WaveletNode *node);
+               
+               void deleteDynFMINodes(WaveletNode *n);
+               
+               //Iterator (public??)
+               void iterateReset();
+               bool iterateNext();
+               uchar iterateGetSymbol();
+               void recursiveIterateResetWaveletNode(WaveletNode *w);
+               
+
+               // small help functions
+               double log2(double x){
+                       return (log10(x) / log10((double)2));
+               }
+               
+               int binaryTree_parent(int i){
+                       return (int)floor((double)i/2);
+               }
+
+               int binaryTree_left(int i){
+                       return 2*i;
+               }
+
+               int binaryTree_right(int i){
+                       return 2*i + 1;
+               }
+
+               bool binaryTree_isLeftChild(int i){
+                       return (i%2==(int)0);
+               }
+
+               bool binaryTree_isRightChild(int i){
+                       return (i%2==(int)1);
+               }
+               
+};
+
+
+
+
diff --git a/handle.cpp b/handle.cpp
new file mode 100644 (file)
index 0000000..9568dd4
--- /dev/null
@@ -0,0 +1,288 @@
+
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *   No object matches key 'wgerlach'.   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <iostream>
+#include <cstdlib>
+
+#include "handle.h"
+
+using std::cerr;
+using std::cout;
+using std::endl;
+
+ulong Handle::getPos(ulong key){
+       HandleNode *n = getKey(key);
+       if (n==0) return 0;
+
+       PosNode *p = n->posNode;
+
+       #ifndef NDEBUG
+       if (p==pos->getNil()) {
+               cerr << "error: Handle::getPos: no PosNode found." << endl;
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       return (this->pos->getPos(p));
+}
+
+ulong* Handle::getEndPositions(){
+       ulong num = getSize();
+       ulong i = 0;
+       
+       if (this->root == getNil()) return 0;
+       
+       ulong *positions= new ulong[num];
+       
+       
+       HandleNode *n = (HandleNode*) treeMinimum(root);
+       while (n != getNil()) {
+               
+               //printf("node: %d", n);
+            positions[i] = this->pos->getPos(n->posNode);
+            i++;
+               n = (HandleNode*) treeSuccessor((RBNode*) n);
+       }
+       return positions;
+}
+
+
+ulong* Handle::getKeys(){
+       ulong num = getSize();
+       ulong i = 0;
+       
+       if (this->root == getNil()) return 0;
+       
+       ulong *keys= new ulong[num];
+       
+       
+       HandleNode *n = (HandleNode*) treeMinimum(root);
+       #ifndef NDEBUG
+       HandleNode *oldn;
+       #endif
+       while (n != getNil()) {
+               
+               //printf("node: %d", n);
+               keys[i] = n->key;
+               i++;
+               #ifndef NDEBUG
+               oldn = n;
+               #endif
+               n = (HandleNode*) treeSuccessor((RBNode*) n);
+               #ifndef NDEBUG
+               if (n == oldn) {
+                               cerr << "old!" << endl;
+                               exit(EXIT_FAILURE);
+                               }
+               #endif
+       }
+       #ifndef NDEBUG
+       if (num != i) {
+               cerr << "num: " << num << endl;
+               cerr << "i: " << i << endl;
+               cerr << "error: Handle::getKeys: key number was not correct." << endl;
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       return keys;
+}
+
+void Handle::deleteKey(ulong key){
+
+       HandleNode *n = getKey(key);
+       this->pos->deletePosNode(n->posNode);
+       
+       rbDelete( n, callHandleUpdateCountersOnPathToRoot);
+       delete n;
+}
+
+
+
+
+HandleNode* Handle::getKey(ulong key){
+       HandleNode *n= getRoot();
+       while (n != getNil()) {
+               if (n->key == key) return n;
+               if (n->getLeft() != getNil()) {
+                       if (key <= n->getLeft()->maxkey) n=n->getLeft();
+                       else n=n->getRight();
+               }
+               else n=n->getRight();
+       }
+       
+                       
+       return 0;
+}
+
+void Handle::updateCountersOnPathToRoot(HandleNode *n) {
+       while (n != getNil()) {
+               updateCounters(n);
+               
+               n = n->getParent();
+       }
+}
+
+
+void Handle::updateCounters(HandleNode *n) {
+       #ifndef NDEBUG
+       if (n == getNil()) {
+               cerr << "error: Handle::updateCounters" << endl;
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       n->subtreeSize=1;
+       
+       
+       n->subtreeSize += n->getLeft()->subtreeSize;
+       
+       
+       if ( n->getRight() != getNil()) {
+               n->subtreeSize += n->getRight()->subtreeSize;
+               n->maxkey=n->getRight()->maxkey;
+       } else n->maxkey=n->key;
+       
+}
+
+               
+HandleNode* Handle::getNewKey(){
+       HandleNode *n= getRoot();
+       
+       if (n == getNil()) {
+               //tree is empty
+               HandleNode *newLeaf= new HandleNode(getNil(),1); // 1=smallest key, 0 = error
+               setRoot(newLeaf);
+               ((RBNode*)newLeaf)->color=BLACK;
+               
+               return newLeaf;
+       }
+       
+       if (n->maxkey == n->subtreeSize) {
+               // tree is full
+               HandleNode *last = (HandleNode*) treeMaximum(n);
+               
+               
+               HandleNode *newLeaf= new HandleNode(getNil(), n->maxkey+1);
+               
+               last->setRight(newLeaf);
+               newLeaf->setParent(last);
+               
+               if (newLeaf->getParent() != getNil()) updateCountersOnPathToRoot(newLeaf->getParent());
+               
+               rbInsertFixup(newLeaf, callHandleUpdateCounters);
+               
+               return newLeaf;
+       }
+       
+       HandleNode *newNode;
+       ulong smallerkeys = 0;
+       ulong lmax; //getLeft()->maxkey
+       ulong lsub; //n->getLeft()->subtreeSize
+       while (true) {
+               cout << "search first free key" << endl;
+               // search first free key
+               
+               lmax = n->getLeft()->maxkey;
+               lsub = n->getLeft()->subtreeSize;
+               
+
+               if (lmax == 0) { // no left child
+                       if (smallerkeys+1 < n->key) { // check if it is free
+                               newNode= new HandleNode(getNil(), smallerkeys+1);
+                               newNode->setParent(n);
+                               n->setLeft(newNode);
+                               updateCountersOnPathToRoot(n);
+                               rbInsertFixup(newNode, callHandleUpdateCounters);
+                               return newNode;
+                       }
+               } else { //left child exists
+                       if ( lmax > (lsub + smallerkeys) ) { 
+                               // free key at left subtree
+                               n=n->getLeft();
+                               continue;
+                       } else if (lmax + 1 < n->key) { // full left subtree, check if it is free inbetween
+                               // insert to predecessor
+                               n=(HandleNode*)treePredecessor(n);
+                               //found place ->  :predeccessor new key = lmax + 1
+                               newNode= new HandleNode(getNil(), lmax+1);
+                               newNode->setParent(n);
+                               n->setRight(newNode);
+                               updateCountersOnPathToRoot(n);
+                               rbInsertFixup(newNode, callHandleUpdateCounters);
+                               return newNode;
+       
+                       }
+               }
+               
+               smallerkeys += 1+lsub;
+
+               if (n->getRight() == getNil()) { // no right child, must be free
+                       newNode= new HandleNode(getNil(), smallerkeys+1);
+                       newNode->setParent(n);
+                       n->setRight(newNode);
+                       updateCountersOnPathToRoot(n);
+                       rbInsertFixup(newNode, callHandleUpdateCounters);
+                       return newNode;
+               } else { //right child exists
+                       
+                       //n=n->getRight();
+                       if (n->getRight()->maxkey-smallerkeys == n->getRight()->subtreeSize) { // right subTree is full, insert in front
+                               ulong leftMinKey = n->getRight()->maxkey - n->getRight()->subtreeSize;
+                               if (leftMinKey -1 >  n->key) { //in front there is space
+                                       //insert new n-key + 1
+                                       newNode= new HandleNode(getNil(), n->key+1);
+                                       n=(HandleNode*)treeSuccessor(n);
+                                       newNode->setParent(n);
+                                       n->setLeft(newNode);
+                                       updateCountersOnPathToRoot(n);
+                                       rbInsertFixup(newNode, callHandleUpdateCounters);
+                                       return newNode;
+                               } else {
+                                       cerr << "error: Handle::getNewKey: no space ?!? " << endl;
+                                       exit(EXIT_FAILURE);
+                               }
+                       } else {
+                               n=n->getRight();
+                       }
+
+
+               }
+       
+               #ifndef NDEBUG  
+               if (n==getNil()) {
+                       cerr << "error: Handle::getNewKey: (A) something wrong ! " << endl;
+                       exit(EXIT_FAILURE);
+               }
+               #endif
+       }
+       #ifndef NDEBUG
+       cerr << "error: Handle::getNewKey: (B) something wrong ! " << endl;
+       exit(EXIT_FAILURE);
+       #endif
+       return 0; // error
+}
+
+void callHandleUpdateCounters(RBNode *n, RBTree *T){
+       ((Handle *)T)->updateCounters((HandleNode*) n);
+}
+
+void callHandleUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){
+       ((Handle *)T)->updateCountersOnPathToRoot((HandleNode*) n);
+}
+
diff --git a/handle.h b/handle.h
new file mode 100644 (file)
index 0000000..dbe2fc4
--- /dev/null
+++ b/handle.h
@@ -0,0 +1,134 @@
+
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *   No object matches key 'wgerlach'.   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+#ifndef Handle
+#define Handle Handle
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+  
+#include "rbtree.h"
+#include "pos.h"
+
+
+class Pos;
+class PosNode;
+
+void callHandleUpdateCounters(RBNode *n, RBTree *T);
+void callHandleUpdateCountersOnPathToRoot(RBNode *n, RBTree *T);
+
+class HandleNode : public RBNode {
+       public:
+       ulong key; //maximum for internal nodes
+       ulong subtreeSize;
+       ulong maxkey;
+       PosNode *posNode;
+
+       HandleNode() 
+               : RBNode(this), key(0), subtreeSize(0), maxkey(0){
+       }
+
+
+       HandleNode(HandleNode *n, ulong key) 
+               : RBNode(n), key(key), subtreeSize(1), maxkey(key){
+       }
+       
+       
+       HandleNode* getParent(){
+               return ((HandleNode*) ((RBNode*) this)->parent);
+       }
+
+       HandleNode* getLeft(){
+               return ((HandleNode*) ((RBNode*) this)->left);
+       }
+
+       HandleNode* getRight(){
+               return ((HandleNode*) ((RBNode*) this)->right);
+       }
+
+       void setParent(HandleNode* n){
+               ((RBNode*) this)->parent=(RBNode*)n;
+       }
+
+       void setLeft(HandleNode* n){
+               ((RBNode*) this)->left=(RBNode*)n;
+       }
+
+       void setRight(HandleNode* n){
+               ((RBNode*) this)->right=(RBNode*)n;
+       }
+       
+};
+
+class Handle : public RBTree{
+       public:
+       Pos *pos;
+
+       Handle(){
+               setNil(new HandleNode());
+               setRoot(getNil());
+       }
+       
+
+       void setRoot(HandleNode* n){
+               ((RBTree*) this)->root=(RBNode*)n;
+       }
+       
+       HandleNode* getRoot(){
+               return ((HandleNode*) ((RBTree*) this)->root);
+       }
+
+       void setNil(HandleNode* n){
+               ((RBTree*) this)->nil=(RBNode*)n;
+       }
+
+       HandleNode* getNil(){
+               return ((HandleNode*) ((RBTree*) this)->nil);
+       }
+
+
+       ulong getSize(){
+               return (getRoot()!=getNil())?getRoot()->subtreeSize:0;
+       }
+       
+       ulong* getKeys();
+       
+       ulong getPos(ulong key);
+        ulong* getEndPositions();
+
+       HandleNode* getKey(ulong key);
+       void deleteKey(ulong key);
+       void updateCountersOnPathToRoot(HandleNode *n);
+       void updateCounters(HandleNode *n);
+               
+       HandleNode* getNewKey();
+       
+       
+       
+};
+
+#endif
diff --git a/makefile b/makefile
new file mode 100644 (file)
index 0000000..0d90342
--- /dev/null
+++ b/makefile
@@ -0,0 +1,15 @@
+CC = g++
+CPPFLAGS = -Wall -ansi -pedantic -g
+
+testTextCollection_obs = testTextCollection.o TextCollection.o CSA.o Tools.o BitRank.o bittree.o handle.o pos.o rbtree.o dynFMI.o
+
+testTextCollection: $(testTextCollection_obs)
+       $(CC) -o testTextCollection $(testTextCollection_obs)
+
+clean:
+       rm -f core *.o *~ testTextCollection
+
+depend:
+       $(CC) -MM *.cpp *.c > dependencies.mk
+
+include dependencies.mk
diff --git a/pos.cpp b/pos.cpp
new file mode 100644 (file)
index 0000000..45f2b15
--- /dev/null
+++ b/pos.cpp
@@ -0,0 +1,170 @@
+
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *   No object matches key 'wgerlach'.   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modquity  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <iostream>
+#include <cstdlib>
+
+#include "pos.h"
+
+using std::cerr;
+using std::endl;
+
+ulong Pos::getPos(PosNode *n){
+       ulong position=0;
+       
+       if (n->getLeft() != getNil()) position += n->getLeft()->subtreeSize;
+       
+       while (n->getParent() != getNil()) {
+               if (n == n->getParent()->getRight()) {
+                       // rightchild
+                       if (n->getParent()->getLeft() != getNil()) position += n->getParent()->getLeft()->subtreeSize;
+                       position++;
+               } 
+               
+               n=n->getParent();
+       }
+       
+       
+       return position+1;
+}
+
+ulong Pos::getTextSize(ulong pos){
+       PosNode *n= getPosNode(pos);
+       
+       return n->textSize;
+}
+
+void Pos::deleteText(ulong pos){
+       PosNode *n= getPosNode(pos);
+       
+       // current node matches.
+       rbDelete(n, callPosUpdateCountersOnPathToRoot);
+       delete n;
+}
+
+void Pos::deletePosNode(PosNode *n){
+       rbDelete(n, callPosUpdateCountersOnPathToRoot);
+       delete n;
+}
+
+PosNode* Pos::getPosNode(ulong pos){
+       // smallest pos is 1 !
+       pos--;
+       PosNode *n= getRoot();
+       ulong leftTree=0;
+       ulong leftSubTreeSize;
+       while (true) {
+               leftSubTreeSize = n->getLeft()->subtreeSize;
+               
+               if (pos == leftTree + leftSubTreeSize ) {
+                       // current node matches.
+                       return n;
+               } else if (pos < leftTree + leftSubTreeSize){
+                       n=n->getLeft();
+                       }
+               else {
+                       leftTree += leftSubTreeSize +1;
+                       n=n->getRight();
+                       }
+       }
+       
+       
+       cerr << "error: Pos::getPosNode: text POS " << pos << " not found!" << endl;
+       exit(EXIT_FAILURE);
+                       
+       return  getNil();
+}
+
+
+ulong Pos::appendText(ulong textSize){
+       PosNode *n= getRoot();
+       
+       if (n == getNil()) {
+               PosNode *newLeaf= new PosNode(getNil(), textSize);
+
+               setRoot(newLeaf);
+               ((RBNode*)newLeaf)->color = BLACK;
+               
+               HandleNode* hn = handle->getNewKey();
+       
+               // connect handleNode and posNode:
+               hn->posNode = newLeaf;
+               newLeaf->handleNode = hn;
+       
+               return hn->key;
+       }
+       
+       while (n->getRight() != getNil()) {
+               n=n->getRight();
+       }
+
+       // new right child !
+       PosNode *newLeaf= new PosNode(getNil(), textSize);
+
+
+       n->setRight(newLeaf);
+       newLeaf->setParent(n);
+       
+       updateCountersOnPathToRoot(newLeaf);
+
+       rbInsertFixup(newLeaf, callPosUpdateCounters);
+
+       HandleNode* hn = handle->getNewKey();
+       
+       // connect handleNode and posNode:
+       hn->posNode = newLeaf;
+       newLeaf->handleNode = hn;
+       
+       return hn->key;
+}
+
+
+
+
+void Pos::updateCountersOnPathToRoot(PosNode *n) {
+       while (n != getNil()) {
+               updateCounters(n);
+               
+               n = n->getParent();
+       }
+}
+
+void Pos::updateCounters(PosNode *n) {
+       n->subtreeSize=1;
+       //n->sumTextSize = n->textSize;
+       
+       n->subtreeSize += n->getLeft()->subtreeSize;
+       //n->sumTextSize += n->getLeft()->sumTextSize;
+       
+       n->subtreeSize += n->getRight()->subtreeSize;
+       //n->sumTextSize += n->getRight()->sumTextSize;
+}
+
+
+
+void callPosUpdateCounters(RBNode *n, RBTree *T){
+       ((Pos *)T)->updateCounters((PosNode*) n);
+}
+
+void callPosUpdateCountersOnPathToRoot(RBNode *n, RBTree *T){
+       ((Pos *)T)->updateCountersOnPathToRoot((PosNode*) n);
+}
+
diff --git a/pos.h b/pos.h
new file mode 100644 (file)
index 0000000..4fcdbb7
--- /dev/null
+++ b/pos.h
@@ -0,0 +1,139 @@
+
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *   No object matches key 'wgerlach'.   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+ //TODO check star color of the nodes pos and handle!
+
+#ifndef Pos
+#define Pos Pos
+
+#ifndef uchar
+#define uchar unsigned char
+#endif
+#ifndef ulong
+#define ulong unsigned long
+#endif
+  
+#include "rbtree.h"
+#include "handle.h"
+
+class Handle;
+class HandleNode;
+
+void callPosUpdateCounters(RBNode *n, RBTree *T);
+void callPosUpdateCountersOnPathToRoot(RBNode *n, RBTree *T);
+class PosNode : public RBNode {
+       public:
+       
+       HandleNode *handleNode;
+       ulong subtreeSize;
+       ulong textSize; // size including endmarker!
+       //ulong sumTextSize; // sum of textlength of all texts located in this subtree;
+       
+       PosNode() // NIL
+               : RBNode(this),subtreeSize(0),textSize(0) { 
+       }
+
+       PosNode(PosNode *n, ulong textSize)
+               : RBNode(n),subtreeSize(1),textSize(textSize) { 
+       }
+       
+               
+       PosNode* getParent(){
+               return ((PosNode*) ((RBNode*) this)->parent);
+       }
+
+       PosNode* getLeft(){
+               return ((PosNode*) ((RBNode*) this)->left);
+       }
+
+       PosNode* getRight(){
+               return ((PosNode*) ((RBNode*) this)->right);
+       }
+
+       void setParent(PosNode* n){
+               ((RBNode*) this)->parent=(RBNode*)n;
+       }
+
+       void setLeft(PosNode* n){
+               ((RBNode*) this)->left=(RBNode*)n;
+       }
+
+       void setRight(PosNode* n){
+               ((RBNode*) this)->right=(RBNode*)n;
+       }       
+};
+
+class Pos: public RBTree
+{
+       public:
+       
+       Handle* handle;
+       
+       ulong textNumber;
+       ulong matchPosition;
+
+       ulong sampleInterval;
+
+       Pos(ulong sampleInterval){
+               setNil(new PosNode());
+               setRoot(getNil());
+               this->sampleInterval=sampleInterval;
+       }
+
+       void setRoot(PosNode* n){
+               ((RBTree*) this)->root=(RBNode*)n;
+       }
+       
+       PosNode* getRoot(){
+               return ((PosNode*) ((RBTree*) this)->root);
+       }
+
+       void setNil(PosNode* n){
+               ((RBTree*) this)->nil=(RBNode*)n;
+       }
+
+       PosNode* getNil(){
+               return ((PosNode*) ((RBTree*) this)->nil);
+       }
+
+       ulong getSize(){
+               return (getRoot()!=getNil())?getRoot()->subtreeSize:0;
+       }
+
+
+       
+       ulong getPos(PosNode *n);
+
+
+       PosNode* getPosNode(ulong text);
+       ulong getTextSize(ulong pos);
+       void deleteText(ulong pos);
+       void deletePosNode(PosNode *n);
+       ulong appendText(ulong textSize);
+
+       void updateCountersOnPathToRoot(PosNode *n);
+       void updateCounters(PosNode *n);
+       
+};
+
+#endif
diff --git a/rbtree.cpp b/rbtree.cpp
new file mode 100644 (file)
index 0000000..09326c1
--- /dev/null
@@ -0,0 +1,454 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *   No object matches key 'wgerlach'.   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+#include <iostream>
+#include <cstdlib>
+#include "rbtree.h"
+
+
+using std::cout;
+using std::endl;
+using std::cerr;
+
+
+void RBTree::checkTree(){
+       cout << "start" << endl;
+       this->root->countBlack(0);
+       cout << "stop" << endl;
+}
+
+void RBTree::leftRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)){
+       RBNode* y = x->right;
+       x->right=y->left;
+       if (y->left != nil) y->left->parent=x;
+       y->parent=x->parent;
+       if (x->parent==this->nil)
+               root=y;
+       else {
+               if (x == x->parent->left)
+                       x->parent->left=y;
+               else
+               x->parent->right=y;
+               }
+       y->left=x;
+       x->parent=y;
+
+       // update counters of x and y !
+       if (updateNode != 0) {
+               updateNode(x, this);
+               updateNode(y, this);
+               }
+}
+
+void RBTree::rightRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T)){
+       RBNode* y = x->left;
+       x->left=y->right;
+       if (y->right != nil) y->right->parent=x;
+       y->parent=x->parent;
+
+
+       if (x->parent==nil)
+               root=y;
+       else {
+               if (x == x->parent->right) {
+                       x->parent->right=y;
+               } else {
+                       x->parent->left=y;
+               } 
+       }
+
+       y->right=x;
+       x->parent=y;
+       if (updateNode != 0) {
+               updateNode(x, this);
+               updateNode(y, this);
+               }
+}
+
+void RBTree::rbInsertFixup(RBNode* z, void (*updateNode)(RBNode* n, RBTree *T)){
+       RBNode *y;
+       
+       if (z->parent==nil) return;
+       while (z->parent->color == RED) {
+               if (z->parent == z->parent->parent->left)
+               {
+                       y = z->parent->parent->right;
+                       
+                       if (y->color==RED)
+                       {
+                               z->parent->color=BLACK;        // Case 1a
+                               y->color=BLACK;                // Case 1a
+                               z->parent->parent->color=RED;  // Case 1a
+                               z=z->parent->parent;           // Case 1a
+                       }
+                       else
+                       {
+                               if (z==z->parent->right)
+                               {
+                                       z=z->parent;      // Case 2a
+                                       leftRotate(z, updateNode);  // Case 2a
+                               }
+                               z->parent->color=BLACK;             // Case 3a
+                               z->parent->parent->color=RED;       // Case 3a
+                               rightRotate(z->parent->parent, updateNode);  // Case 3a
+                       }
+               }
+               else
+               {
+
+                       y = z->parent->parent->left;
+
+                       if (y->color==RED)
+                       {
+                               z->parent->color=BLACK;        // Case 1b
+                               y->color=BLACK;                // Case 1b
+                               z->parent->parent->color=RED;  // Case 1b
+                               z=z->parent->parent;           // Case 1b
+                       }
+                       else
+                       {
+                               if (z==z->parent->left)
+                               {
+                                       z=z->parent;      // Case 2b
+                                       rightRotate(z, updateNode);  // Case 2b
+                               }
+                               z->parent->color=BLACK;             // Case 3b
+                               z->parent->parent->color=RED;       // Case 3b
+                               leftRotate(z->parent->parent, updateNode);  // Case 3b
+                       }
+               }
+               if (z->parent==nil) return;
+       } //end of while
+
+       root->color=BLACK;
+}
+
+
+void RBTree::rbDeleteFixup(RBNode *x, void (*updateNode)(RBNode* n, RBTree *T)){
+       RBNode *w;
+
+       while ((x != root) && (x->color == BLACK))
+       {
+               if (x == x->parent->left)
+               {
+                       w=x->parent->right;
+                       
+                       if (w->color == RED)
+                       {
+                               w->color=BLACK;           // Case 1a
+                               x->parent->color=RED;     // Case 1a
+                               leftRotate(x->parent, updateNode);  // Case 1a
+                               w=x->parent->right;       // Case 1a
+                       }
+                       if ((w->left->color == BLACK) && (w->right->color == BLACK))
+                       {
+                               w->color=RED;  // Case 2a
+                               x=x->parent;   // Case 2a
+                       }
+                       else
+                       {
+                               if (w->right->color == BLACK)
+                               {
+                                       w->left->color=BLACK;  // Case 3a
+                                       w->color=RED;          // Case 3a
+                                       rightRotate(w, updateNode);      // Case 3a
+                                       w=x->parent->right;    // Case 3a
+                               }
+                               w->color=x->parent->color;  // Case 4a
+                               x->parent->color=BLACK;     // Case 4a
+                               w->right->color=BLACK;      // Case 4a
+                               leftRotate(x->parent, updateNode);    // Case 4a
+                               x=root;                  // Case 4a
+                       }
+               }
+               else
+               {
+                       w=x->parent->left;
+
+                       if (w->color == RED)
+                       {
+                               w->color=BLACK;           // Case 1b
+                               x->parent->color=RED;     // Case 1b
+                               rightRotate(x->parent, updateNode);  // Case 1b
+                               w=x->parent->left;       // Case 1b
+                       }
+                       if ((w->right->color == BLACK) && (w->left->color == BLACK))
+                       {
+                               w->color=RED;  // Case 2b
+                               x=x->parent;   // Case 2b
+                       }
+                       else
+                       {
+                               if (w->left->color == BLACK)
+                               {
+                                       w->right->color=BLACK;  // Case 3b
+                                       w->color=RED;          // Case 3b
+                                       leftRotate(w, updateNode);      // Case 3b
+                                       w=x->parent->left;    // Case 3b
+                               }
+
+                               w->color=x->parent->color;  // Case 4b
+                               x->parent->color=BLACK;     // Case 4b
+                               w->left->color=BLACK;      // Case 4b
+                               rightRotate(x->parent, updateNode);    // Case 4b
+
+                               x=root;                  // Case 4b
+                       }
+               }
+       } // end of while
+       x->color=BLACK;
+}
+
+// quite similar to Cormen et al
+void RBTree::rbDelete(RBNode *z, void (*updateNode)(RBNode* n, RBTree *T)){
+       RBNode *y,*x,*y_oldParent;
+       y_oldParent=nil;
+       if (z->left == nil || z->right == nil) {
+               y = z;  //z has no or one child, deletion is easy
+       } else {
+               y = treeSuccessor(z);
+               y_oldParent=y->parent;
+       }
+
+       if (y->left != nil) {
+               x=y->left;
+       } else {
+               x=y->right;
+       }
+       
+       x->parent=y->parent;
+
+       // cut out y:
+       if (y->parent == nil) {
+               root=x;
+       } else {
+               if (isLeftChild(y)) {y->parent->left = x;}
+                       else {y->parent->right = x;}
+       }
+       
+       RBNodecolor old_y = y->color;
+       if (y != z) { // 2 children
+               //move y to z's old position and delete z
+               if (root==z) {
+                       root=y;
+               } else {
+                       if (isLeftChild(z)) z->parent->left = y;
+                       else z->parent->right = y;
+               }
+                       
+               y->parent = z->parent;
+               y->left = z->left;
+               y->right = z->right;
+               y->color = z->color;  // don't forget to delete z after rbDelete
+
+               y->left->parent = y;
+               y->right->parent = y;
+               
+       }
+
+
+       if (old_y == BLACK) {
+               rbDeleteFixup(x, updateNode);
+       }
+       
+       updateNode(y, this);
+       if (y_oldParent!=nil) updateNode(y_oldParent, this);
+
+}
+
+
+RBNode* RBTree::treeSuccessor(RBNode *x){
+       if (x->right != nil) return treeMinimum(x->right);
+
+       RBNode *y = x->parent;
+       while ((y!=nil) && (x == y->right)) {
+               x=y;
+               y=y->parent;
+       }
+       return y;
+}
+
+RBNode* RBTree::treePredecessor(RBNode *x){
+       if (x->left != nil) return treeMaximum(x->left);
+
+       RBNode *y = x->parent;
+       while ((y!=nil) && (x == y->left)) {
+               x=y;
+               y=y->parent;
+       }
+       return y;
+}
+
+
+RBNode* RBTree::treeMinimum(RBNode *x){
+       while (x->left != nil) x=x->left;
+       return x;
+}
+
+RBNode* RBTree::treeMaximum(RBNode *x){
+       while (x->right != nil) x=x->right;
+       return x;
+}
+
+
+bool RBTree::isLeftChild(RBNode *n){
+       #ifndef NDEBUG
+       if (n->parent == nil) {
+               cerr << "error: isLeftChild, no parent." << endl;
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       return (n->parent->left == n);
+}
+
+bool RBTree::isRightChild(RBNode *n){
+       #ifndef NDEBUG
+       if ( n->parent == nil) {
+               cerr << "error: isLeftChild, no parent." << endl;
+               exit(EXIT_FAILURE);
+       }
+       #endif
+       return (n->parent->right == n);
+}
+
+
+RBNode* RBTree::findRightSiblingLeaf(RBNode *n){
+       // go up:
+       while (true) {
+               if (n->parent!=nil) {
+                       if (n==n->parent->right)
+                         n=n->parent;
+                       else
+                         break;
+               } else
+                       return nil;
+       }
+               
+       n=n->parent;
+
+       // leftmost leaf in n is the right sibling searched
+       n=n->right;
+
+       // go down:
+       while (n->left != nil) {
+               n=n->left;
+       }
+
+       return n;
+}
+
+RBNode* RBTree::findLeftSiblingLeaf(RBNode *n){
+       // go up:
+       while (true) {
+               if (n->parent!=nil) {
+                       if (n==n->parent->left)
+                               n=n->parent;
+                       else
+                               break;
+               } else
+                       return nil;
+       }
+       
+       n=n->parent;
+       
+       // rightmost leaf in n is the left sibling searched
+       n=n->left;
+
+       // go down:
+       while (n->right != nil) {
+               n=n->right;
+       }
+
+       return n;
+}
+
+
+int RBTree::getNodeMaxDepth(RBNode *n) {
+       int l;
+       int r;
+       if (n->left == nil) l=0;
+       else l=getNodeMaxDepth(n->left);
+       if (n->right == nil) r=0;
+       else r=getNodeMaxDepth(n->right);
+
+       return (1+(l>r?l:r));
+}
+
+int RBTree::getNodeMinDepth(RBNode *n) {
+       int l;
+       int r;
+       if (n->left == 0) l=0;
+       else l=getNodeMinDepth(n->left);
+       if (n->right == 0) r=0;
+       else r=getNodeMinDepth(n->right);
+
+       return (1+(l>r?r:l));
+}
+
+
+void RBTree::printSubTree(RBNode *n){
+       cout << ((n->color==BLACK)?"B":"R") << n << " [";
+       if (n->left==nil) cout << "N";
+               else printSubTree(n->left);
+       cout << ",";
+       if (n->right==nil) cout << "N";
+               else printSubTree(n->right);
+       cout << "]";
+}
+
+
+void RBTree::checkSubTree(RBNode *n){
+
+       if (n->left!=nil) {
+               if (n->left->parent != n) {
+                       cout << "au"<< endl;
+                       exit(1);
+               }
+               checkSubTree(n->left);
+       }
+
+       if (n->right!=nil) {
+               if (n->right->parent != n) {
+                       cout << "au"<< endl;
+                       exit(1);
+               }
+               checkSubTree(n->right);
+       }
+
+}
+
+
+void RBTree::checkNode(RBNode *n){
+       if (n->left!=nil) {
+               if (n->left->parent != n) {
+                       cout << "au"<< endl;
+                       exit(1);
+               }
+               
+       }
+
+       if (n->right!=nil) {
+               if (n->right->parent != n) {
+                       cout << "au"<< endl;
+                       exit(1);
+               }
+       }       
+}
diff --git a/rbtree.h b/rbtree.h
new file mode 100644 (file)
index 0000000..b59c0a3
--- /dev/null
+++ b/rbtree.h
@@ -0,0 +1,116 @@
+/***************************************************************************
+ *   Copyright (C) 2006 by Wolfgang Gerlach   *
+ *   No object matches key 'wgerlach'.   *
+ *                                                                         *
+ *   This program is free software; you can redistribute it and/or modify  *
+ *   it under the terms of the GNU General Public License as published by  *
+ *   the Free Software Foundation; either version 2 of the License, or     *
+ *   (at your option) any later version.                                   *
+ *                                                                         *
+ *   This program is distributed in the hope that it will be useful,       *
+ *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
+ *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
+ *   GNU General Public License for more details.                          *
+ *                                                                         *
+ *   You should have received a copy of the GNU General Public License     *
+ *   along with this program; if not, write to the                         *
+ *   Free Software Foundation, Inc.,                                       *
+ *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
+ ***************************************************************************/
+
+
+// this is a red-black tree implementation by wolfgang Gerlach based on the algorithm provided by 
+// Cormen et al.: Introduction to Algorithms, Second Edition. MIT Press and McGraw-Hill, 2001
+
+
+
+#ifndef RBTree
+#define RBTree RBTree
+
+
+
+enum RBNodecolor{BLACK,RED};
+
+
+// generic Red-Black Tree Node:
+class RBNode
+{
+       public:
+       RBNode* parent;
+       RBNode* left;
+       RBNode* right;   // 3*4 bytes
+       
+       enum RBNodecolor color; // due to structure alignment: 4 bytes !!!
+
+       RBNode(){};
+
+       RBNode(RBNode *n)
+               : parent(n), left(n), right(n){
+               color=RED;
+       }
+
+
+       virtual ~RBNode(){} //adds 4 bytes vtable
+
+       void countBlack(int i){
+            if (this->color == BLACK) i++;
+            if (this->left != this) this->left->countBlack(i);
+            else std::cout << i << ",";
+            if (this->right != this) this->right->countBlack(i);
+            else std::cout << i << ",";
+       }
+};
+
+class RBTree{
+       public:
+
+       RBNode *root;
+       RBNode *nil;
+       
+       
+       virtual ~RBTree(){
+            // Don't double free!
+            if (root != this->nil)
+                deleteNode(root);
+            root = 0;
+            delete this->nil;
+            this->nil = 0;
+       }
+
+       void checkTree();
+
+       void rbInsertFixup(RBNode* z, void (*updateNode)(RBNode* n, RBTree *T));
+       void rbDeleteFixup(RBNode *x, void (*updateNode)(RBNode* n, RBTree *T));
+       void rbDelete(RBNode *z, void (*updateNode)(RBNode* n, RBTree *T));
+       RBNode* findRightSiblingLeaf(RBNode *n);
+       RBNode* findLeftSiblingLeaf(RBNode *n);
+       RBNode* treeSuccessor(RBNode *x);
+       RBNode* treePredecessor(RBNode *x);
+       RBNode* treeMinimum(RBNode *x);
+       RBNode* treeMaximum(RBNode *x);
+       
+       bool isLeftChild(RBNode *n);
+       bool isRightChild(RBNode *n);
+       
+       int getNodeMaxDepth(RBNode *n);
+       int getNodeMinDepth(RBNode *n);
+       
+       void printSubTree(RBNode *n);
+       void checkSubTree(RBNode *n);
+       void checkNode(RBNode *x);
+
+       void deleteNode(RBNode* x){
+               if (x->left!=nil && x->left) deleteNode(x->left);
+               if (x->right!=nil && x->right) deleteNode(x->right);
+               delete x;
+       }
+
+
+       private:
+       void leftRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T));
+       void rightRotate(RBNode* x, void (*updateNode)(RBNode* n, RBTree *T));
+
+};
+
+#endif
diff --git a/testTextCollection.cpp b/testTextCollection.cpp
new file mode 100644 (file)
index 0000000..d005b8f
--- /dev/null
@@ -0,0 +1,69 @@
+// Test driver for text collection
+#include <iostream>
+using std::cout;
+using std::endl;
+
+#include "TextCollection.h"
+using SXSI::TextCollection;
+
+void printDocumentResult(TextCollection::document_result dr)
+{
+    TextCollection::document_result::iterator it;
+    printf("document result:");
+    for (it = dr.begin(); it != dr.end(); ++it)
+        printf(" %i", *it);
+    printf("\n");
+    fflush(stdout);
+}
+
+void printFullResult(TextCollection::full_result fr)
+{
+    TextCollection::full_result::iterator it;
+    printf("full result:");
+    for (it = fr.begin(); it != fr.end(); ++it)
+        printf(" (%i, %lu)", (*it).first, (*it).second);
+    printf("\n");
+    fflush(stdout);
+}
+
+int main()
+{
+    uchar *text = (uchar*) "acabab";
+    TextCollection *csa = TextCollection::InitTextCollection(1);
+    csa->InsertText(text);
+    text = (uchar*) "abaca";
+    csa->InsertText(text);
+    text = (uchar*) "abacb";
+    csa->InsertText(text);
+
+    csa->MakeStatic();
+    
+    text = csa->GetText(0);
+    cout << "Text 0: \"" << text << "\"" << endl;
+    delete [] text;
+    text = csa->GetText(1);
+    cout << "Text 1: \"" << text << "\"" << endl;
+    delete [] text;
+    text = csa->GetText(2);
+    cout << "Text 2: \"" << text << "\"" << endl;
+    delete [] text;
+    
+    text = csa->GetText(2, 2, 4);
+    cout << "Substring of Text 3: \"" << text << "\"" << endl;
+    delete [] text;
+    
+    printf("n:o contains: %u\n", csa->CountContains((uchar *)"ac"));
+    printf("n:o suffix: %u\n", csa->CountSuffix((uchar *)"b"));
+    printf("n:o equal: %u\n", csa->CountEqual((uchar *)"acabab"));
+    printf("is equal: %u\n", csa->IsEqual((uchar *)"abacb"));
+    
+    TextCollection::document_result dr;
+    dr = csa->Contains((uchar*)"ab");
+    printDocumentResult(dr);
+
+    TextCollection::full_result fr;
+    fr = csa->FullContains((uchar *)"ab");
+    printFullResult(fr);
+
+    delete csa;
+}