CSA.h

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20
  21 #ifndef _CSA_H_
  22 #define _CSA_H_
  23 #include "dynFMI.h"
  24 #include "BitRank.h"
  25 #include "TextCollection.h"
  26 #include "BlockArray.h"
  27 #include "RLWaveletTree.h"
  28 #include <set>
  29 #include <vector>
  30
  31 // Include  from XMLTree/libcds
  32 #include <basics.h> // Defines W == 32
  33 #include <static_bitsequence.h>
  34 #include <alphabet_mapper.h>
  35 #include <static_sequence.h>
  36
  37 // Re-define word size to ulong:
  38 #undef W
  39 #if __WORDSIZE == 64
  40 #   define W 64
  41 #else
  42 #   define W 32
  43 #endif
  44 #undef bitset
  45
  46 // Un-comment to compare BWT against a BWT generated from class dynFMI:
  47 //#define CSA_TEST_BWT
  48
  49 /**
  50  * Implementation of the TextCollection interface
  51  *
  52  */
  53 class CSA : public SXSI::TextCollection {
  54 public:
  55     /**
  56      * Constructor with (default) samplerate
  57      */
  58     explicit CSA(unsigned);
  59     ~CSA();
  60     /**
  61      * Following functions implement the interface described in TextCollection.h.
  62      * For details/documentation, see TextCollection.h.
  63      */
  64     void InsertText(uchar const *);
  65     void MakeStatic();
  66     bool EmptyText(DocId) const;
  67     uchar* GetText(DocId) const;
  68     /**
  69      * Next method is not supported:
  70      * Supporting GetText for some substring [i,j]
  71      * would require more space.
  72      */
  73 //    uchar* GetText(DocId, TextPosition, TextPosition) const;
  74
  75     bool IsPrefix(uchar const *) const;
  76     bool IsSuffix(uchar const *) const;
  77     bool IsEqual(uchar const *) const;
  78     bool IsContains(uchar const *) const;
  79     bool IsLessThan(uchar const *) const;
  80
  81     ulong Count(uchar const *) const;
  82     unsigned CountPrefix(uchar const *) const;
  83     unsigned CountSuffix(uchar const *) const;
  84     unsigned CountEqual(uchar const *) const;
  85     unsigned CountContains(uchar const *) const;
  86     unsigned CountLessThan(const unsigned char*) const;
  87
  88     // document_result is inherited from SXSI::TextCollection.
  89     document_result Prefix(uchar const *) const;
  90     document_result Suffix(uchar const *) const;
  91     document_result Equal(uchar const *) const;
  92     document_result Contains(uchar const *) const;
  93     document_result LessThan(uchar const *) const;
  94
  95     // full_result is inherited from SXSI::TextCollection.
  96     full_result FullContains(uchar const *) const;
  97
  98     // Index from/to disk
  99     void Load(FILE *, unsigned);
 100     void Save(FILE *) const;
 101
 102
 103     // FIXME Remove
 104     void deleteWT()
 105     {
 106         delete alphabetrank;
 107         alphabetrank = 0;
 108         delete [] codetable;
 109         codetable = 0;
 110     }
 111     void deleteSamples()
 112     {
 113         delete sampled;
 114         sampled =0;
 115         delete suffixes;
 116         suffixes = 0;
 117         delete suffixDocId;
 118         suffixDocId = 0;
 119     }
 120     void deleteEndmarker()
 121     {
 122         delete endmarkerDocId;
 123         endmarkerDocId = 0;
 124     }
 125
 126 private:
 127     // FIXME Unused code
 128     class TCodeEntry {
 129     public:
 130         unsigned count;
 131         unsigned bits;
 132         unsigned code;
 133         TCodeEntry() {count=0;bits=0;code=0u;};
 134     };
 135
 136
 137     // FIXME Unused code
 138     class THuffAlphabetRank {
 139     // using fixed 0...255 alphabet
 140     private:
 141         BitRank *bitrank;
 142         THuffAlphabetRank *left;
 143         THuffAlphabetRank *right;
 144         TCodeEntry *codetable;
 145         uchar ch;
 146         bool leaf;
 147     public:
 148         THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
 149         THuffAlphabetRank(FILE *);
 150         ~THuffAlphabetRank();
 151
 152         void Save(FILE *);
 153         bool Test(uchar *, TextPosition);
 154
 155         inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
 156             THuffAlphabetRank const * temp=this;
 157             if (codetable[c].count == 0) return 0;
 158             unsigned level = 0;
 159             unsigned code = codetable[c].code;
 160             while (!temp->leaf) {
 161                 if ((code & (1u<<level)) == 0) {
 162                 i = i-temp->bitrank->rank(i);
 163                     temp = temp->left;
 164                 }
 165                 else {
 166                     i = temp->bitrank->rank(i)-1;
 167                     temp = temp->right;
 168                 }
 169                ++level;
 170             }
 171             return i+1;
 172         };
 173         inline bool IsCharAtPos(int c, TextPosition i) const {
 174             THuffAlphabetRank const * temp=this;
 175             if (codetable[c].count == 0) return false;
 176             unsigned level = 0;
 177             unsigned code = codetable[c].code;
 178             while (!temp->leaf) {
 179                 if ((code & (1u<<level))==0) {
 180                     if (temp->bitrank->IsBitSet(i)) return false;
 181                     i = i-temp->bitrank->rank(i);
 182                     temp = temp->left;
 183                 }
 184                 else {
 185                     if (!temp->bitrank->IsBitSet(i)) return false;
 186                     i = temp->bitrank->rank(i)-1;
 187                     temp = temp->right;
 188                 }
 189                ++level;
 190             }
 191             return true;
 192         }
 193         inline uchar access(TextPosition i) const {
 194             THuffAlphabetRank const * temp=this;
 195             while (!temp->leaf) {
 196                 if (temp->bitrank->IsBitSet(i)) {
 197                 i = temp->bitrank->rank(i)-1;
 198                 temp = temp->right;
 199             }
 200             else {
 201                 i = i-temp->bitrank->rank(i);
 202                     temp = temp->left;
 203             }
 204             }
 205             return temp->ch;
 206         }
 207
 208         inline uchar charAtPos(ulong i, TextPosition *rank) const {
 209             THuffAlphabetRank const * temp=this;
 210             while (!temp->leaf) {
 211                 if (temp->bitrank->IsBitSet(i)) {
 212                     i = temp->bitrank->rank(i)-1;
 213                     temp = temp->right;
 214                 } else {
 215                     i = i-temp->bitrank->rank(i);
 216                     temp = temp->left;
 217                 }
 218             }
 219             (*rank)=i;
 220             return temp->ch;
 221         }
 222     };
 223
 224     // FIXME Unused code
 225     class node {
 226     private:
 227         unsigned weight;
 228         uchar value;
 229         node *child0;
 230         node *child1;
 231
 232         void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
 233         static void count_chars(uchar *, TextPosition , TCodeEntry *);
 234         static unsigned SetBit(unsigned , unsigned , unsigned );
 235     public:
 236         node( unsigned char c = 0, unsigned i = 0 ) {
 237             value = c;
 238             weight = i;
 239             child0 = 0;
 240             child1 = 0;
 241         }
 242
 243         node( node* c0, node *c1 ) {
 244             value = 0;
 245             weight = c0->weight + c1->weight;
 246             child0 = c0;
 247             child1 = c1;
 248         }
 249
 250
 251         bool operator>( const node &a ) const {
 252             return weight > a.weight;
 253         }
 254
 255         static TCodeEntry *makecodetable(uchar *, TextPosition);
 256     };
 257
 258     static const uchar versionFlag;
 259     TextPosition n;
 260     unsigned samplerate;
 261     unsigned C[256];
 262     TextPosition bwtEndPos;
 263 //    THuffAlphabetRank *alphabetrank;
 264     //    RLWaveletTree *alphabetrank;
 265     static_sequence * alphabetrank;
 266
 267 #ifdef CSA_TEST_BWT
 268     DynFMI *dynFMI;
 269 #endif
 270     TCodeEntry *codetable;
 271
 272     // Sample structures for texts longer than samplerate
 273     BSGAP *sampled;
 274     BlockArray *suffixes;
 275     BlockArray *suffixDocId;
 276
 277     // Total number of texts in the collection
 278     unsigned numberOfTexts;
 279     // Total number of texts including empty texts
 280     unsigned numberOfAllTexts;
 281     // Length of the longest text
 282     ulong maxTextLength;
 283
 284     // Array of document id's in the order of end-markers in BWT
 285     // Access by endmarkerDocId[rank_$(L, p) - 1].
 286     BlockArray *endmarkerDocId;
 287
 288     // FIXME Replace with a more succinct data structure
 289     // Note: Empty texts are already handled inside XMLTree class.
 290     std::set<unsigned> emptyTextId;
 291     BSGAP *emptyTextRank;
 292
 293     // FIXME A better solution?
 294     std::string texts;
 295
 296     // Following are not part of the public API
 297     uchar * BWT(uchar *);
 298     void makewavelet(uchar *);
 299     void maketables();
 300     DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
 301     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 302 //    TextPosition Inverse(TextPosition) const;
 303 //    TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
 304 //    TextPosition Psi(TextPosition) const;
 305 //    uchar * Substring(TextPosition, TextPosition) const;
 306 //    TextPosition Lookup(TextPosition) const;
 307
 308     /**
 309      * Count end-markers in given interval
 310      */
 311     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
 312     {
 313         if (sp > ep)
 314             return 0;
 315
 316         ulong ranksp = 0;
 317         if (sp != 0)
 318             ranksp = alphabetrank->rank(0, sp - 1);
 319
 320         return alphabetrank->rank(0, ep) - ranksp;
 321     }
 322 };
 323
 324 #endif