CSA.h

   1 /******************************************************************************
   2  *   Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki                *
   3  *                                                                            *
   4  *                                                                            *
   5  *   This program is free software; you can redistribute it and/or modify     *
   6  *   it under the terms of the GNU Lesser General Public License as published *
   7  *   by the Free Software Foundation; either version 2 of the License, or     *
   8  *   (at your option) any later version.                                      *
   9  *                                                                            *
  10  *   This program is distributed in the hope that it will be useful,          *
  11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of           *
  12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the            *
  13  *   GNU Lesser General Public License for more details.                      *
  14  *                                                                            *
  15  *   You should have received a copy of the GNU Lesser General Public License *
  16  *   along with this program; if not, write to the                            *
  17  *   Free Software Foundation, Inc.,                                          *
  18  *   51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA.            *
  19  *****************************************************************************/
  20
  21 #ifndef _CSA_H_
  22 #define _CSA_H_
  23 #include "dynFMI.h"
  24
  25 // Include  from XMLTree/libcds
  26 #include <basics.h>
  27 #include <static_bitsequence.h>
  28 #include <alphabet_mapper.h>
  29 #include <static_sequence.h>
  30
  31 #include <set>
  32 #include <vector>
  33 #include "BitRank.h"
  34 #include "TextCollection.h"
  35 #include "BlockArray.h"
  36 #include "RLWaveletTree.h"
  37
  38 /**
  39  * Implementation of the TextCollection interface
  40  *
  41  * Implementation notes:
  42  *
  43  * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
  44  * which requires O(n log \sigma) bits of memory. If we know the distribution
  45  * of characters beforehand, space can easily be made O(nH_0).
  46  *
  47  * The method MakeStatic() constructs a Succinct Suffix Array with a huffman
  48  * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
  49  *
  50  * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
  51  * with any FM-index variant supporting LF-mapping.
  52  */
  53 class CSA : public SXSI::TextCollection {
  54 public:
  55     /**
  56      * Constructor with (default) samplerate
  57      */
  58     explicit CSA(unsigned);
  59     ~CSA();
  60     /**
  61      * Following functions implement the interface described in TextCollection.h.
  62      * For details/documentation, see TextCollection.h.
  63      */
  64     void InsertText(uchar const *);
  65     void MakeStatic();
  66     bool EmptyText(DocId) const;
  67     uchar* GetText(DocId) const;
  68 //    uchar* GetText(DocId, TextPosition, TextPosition) const;
  69
  70     bool IsPrefix(uchar const *) const;
  71     bool IsSuffix(uchar const *) const;
  72     bool IsEqual(uchar const *) const;
  73     bool IsContains(uchar const *) const;
  74     bool IsLessThan(uchar const *) const;
  75
  76     unsigned CountPrefix(uchar const *) const;
  77     unsigned CountSuffix(uchar const *) const;
  78     unsigned CountEqual(uchar const *) const;
  79     unsigned CountContains(uchar const *) const;
  80     unsigned CountLessThan(const unsigned char*) const;
  81
  82     // document_result is inherited from SXSI::TextCollection.
  83     document_result Prefix(uchar const *) const;
  84     document_result Suffix(uchar const *) const;
  85     document_result Equal(uchar const *) const;
  86     document_result Contains(uchar const *) const;
  87     document_result LessThan(uchar const *) const;
  88
  89     // full_result is inherited from SXSI::TextCollection.
  90     full_result FullContains(uchar const *) const;
  91
  92     // Index from/to disk
  93     void Load(FILE *, unsigned);
  94     void Save(FILE *) const;
  95
  96
  97     // Debug FIXME Remove
  98     void deleteWT()
  99     {
 100         delete alphabetrank;
 101         alphabetrank = 0;
 102         delete [] codetable;
 103         codetable = 0;
 104     }
 105     void deleteSamples()
 106     {
 107         delete sampled;
 108         sampled =0;
 109         delete suffixes;
 110         suffixes = 0;
 111         delete positions;
 112         positions = 0;
 113         delete suffixDocId;
 114         suffixDocId = 0;
 115     }
 116     void deleteEndmarker()
 117     {
 118         delete endmarkerDocId;
 119         endmarkerDocId = 0;
 120     }
 121
 122 private:
 123     class TCodeEntry {
 124     public:
 125         unsigned count;
 126         unsigned bits;
 127         unsigned code;
 128         TCodeEntry() {count=0;bits=0;code=0u;};
 129     };
 130
 131
 132     class THuffAlphabetRank {
 133     // using fixed 0...255 alphabet
 134     private:
 135         BitRank *bitrank;
 136         THuffAlphabetRank *left;
 137         THuffAlphabetRank *right;
 138         TCodeEntry *codetable;
 139         uchar ch;
 140         bool leaf;
 141     public:
 142         THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
 143         THuffAlphabetRank(FILE *);
 144         ~THuffAlphabetRank();
 145
 146         void Save(FILE *);
 147         bool Test(uchar *, TextPosition);
 148
 149         inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
 150             THuffAlphabetRank const * temp=this;
 151             if (codetable[c].count == 0) return 0;
 152             unsigned level = 0;
 153             unsigned code = codetable[c].code;
 154             while (!temp->leaf) {
 155                 if ((code & (1u<<level)) == 0) {
 156                 i = i-temp->bitrank->rank(i);
 157                     temp = temp->left;
 158                 }
 159                 else {
 160                     i = temp->bitrank->rank(i)-1;
 161                     temp = temp->right;
 162                 }
 163                ++level;
 164             }
 165             return i+1;
 166         };
 167         inline bool IsCharAtPos(int c, TextPosition i) const {
 168             THuffAlphabetRank const * temp=this;
 169             if (codetable[c].count == 0) return false;
 170             unsigned level = 0;
 171             unsigned code = codetable[c].code;
 172             while (!temp->leaf) {
 173                 if ((code & (1u<<level))==0) {
 174                     if (temp->bitrank->IsBitSet(i)) return false;
 175                     i = i-temp->bitrank->rank(i);
 176                     temp = temp->left;
 177                 }
 178                 else {
 179                     if (!temp->bitrank->IsBitSet(i)) return false;
 180                     i = temp->bitrank->rank(i)-1;
 181                     temp = temp->right;
 182                 }
 183                ++level;
 184             }
 185             return true;
 186         }
 187         inline uchar access(TextPosition i) const {
 188             THuffAlphabetRank const * temp=this;
 189             while (!temp->leaf) {
 190                 if (temp->bitrank->IsBitSet(i)) {
 191                 i = temp->bitrank->rank(i)-1;
 192                 temp = temp->right;
 193             }
 194             else {
 195                 i = i-temp->bitrank->rank(i);
 196                     temp = temp->left;
 197             }
 198             }
 199             return temp->ch;
 200         }
 201
 202         inline uchar charAtPos(ulong i, TextPosition *rank) const {
 203             THuffAlphabetRank const * temp=this;
 204             while (!temp->leaf) {
 205                 if (temp->bitrank->IsBitSet(i)) {
 206                     i = temp->bitrank->rank(i)-1;
 207                     temp = temp->right;
 208                 } else {
 209                     i = i-temp->bitrank->rank(i);
 210                     temp = temp->left;
 211                 }
 212             }
 213             (*rank)=i;
 214             return temp->ch;
 215         }
 216     };
 217
 218     class node {
 219     private:
 220         unsigned weight;
 221         uchar value;
 222         node *child0;
 223         node *child1;
 224
 225         void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const;
 226         static void count_chars(uchar *, TextPosition , TCodeEntry *);
 227         static unsigned SetBit(unsigned , unsigned , unsigned );
 228     public:
 229         node( unsigned char c = 0, unsigned i = 0 ) {
 230             value = c;
 231             weight = i;
 232             child0 = 0;
 233             child1 = 0;
 234         }
 235
 236         node( node* c0, node *c1 ) {
 237             value = 0;
 238             weight = c0->weight + c1->weight;
 239             child0 = c0;
 240             child1 = c1;
 241         }
 242
 243
 244         bool operator>( const node &a ) const {
 245             return weight > a.weight;
 246         }
 247
 248         static TCodeEntry *makecodetable(uchar *, TextPosition);
 249     };
 250
 251     static const unsigned char print = 1;
 252     static const unsigned char report = 1;
 253     static const uchar versionFlag;
 254     TextPosition n;
 255     unsigned samplerate;
 256     unsigned C[256];
 257     TextPosition bwtEndPos;
 258 //    THuffAlphabetRank *alphabetrank;
 259     //    RLWaveletTree *alphabetrank;
 260     static_sequence * alphabetrank;
 261     BSGAP *sampled;
 262     BlockArray *suffixes;
 263     BlockArray *suffixDocId;
 264     BlockArray *positions;
 265     TCodeEntry *codetable;
 266     DynFMI *dynFMI;
 267
 268     // Total number of texts in the collection
 269     unsigned numberOfTexts;
 270     // Total number of texts including empty texts
 271     unsigned numberOfAllTexts;
 272     // Length of the longest text
 273     ulong maxTextLength;
 274
 275     // Array of document id's in the order of end-markers in BWT
 276     // Access by endmarkerDocId[rank_$(L, p) - 1].
 277     BlockArray *endmarkerDocId;
 278     // Array of text lengths (in the inserted order)
 279     BlockArray *textLength;
 280     // Array of text starting positions (in the inserted order)
 281     BlockArray *textStartPos;
 282
 283     // FIXME Replace with a more succinct data structure
 284     std::set<unsigned> emptyTextId;
 285     BSGAP *emptyTextRank;
 286
 287     // Private methods
 288     uchar * BWT(uchar *);
 289     uchar * LoadFromFile(const char *);
 290     void SaveToFile(const char *, uchar *);
 291     void makewavelet(uchar *);
 292     void maketables();
 293
 294     // Following are not part of the public API
 295     DocId DocIdAtTextPos(TextPosition) const;
 296     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
 297     TextPosition Inverse(TextPosition) const;
 298     TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
 299     TextPosition Psi(TextPosition) const;
 300     uchar * Substring(TextPosition, TextPosition) const;
 301     TextPosition Lookup(TextPosition) const;
 302
 303     /**
 304      * Count end-markers in given interval
 305      */
 306     inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
 307     {
 308         if (sp > ep)
 309             return 0;
 310
 311         ulong ranksp = 0;
 312         if (sp != 0)
 313             ranksp = alphabetrank->rank(0, sp - 1);
 314
 315         return alphabetrank->rank(0, ep) - ranksp;
 316     }
 317 };
 318
 319 #endif