BWT via Difference cover

[SXSI/TextCollection.git] / CSA.h
diff --git a/CSA.h b/CSA.h

index bd7b199..c0751cf 100644 (file)
--- a/CSA.h
+++ b/CSA.h
@@ -20,28 +20,35 @@
  
  #ifndef _CSA_H_
  #define _CSA_H_
-
-#include <map>
-#include <vector>
-#include "BitRank.h"
  #include "dynFMI.h"
+#include "BitRank.h"
  #include "TextCollection.h"
  #include "BlockArray.h"
+#include "RLWaveletTree.h"
+#include <set>
+#include <vector>
+
+// Include  from XMLTree/libcds
+#include <basics.h> // Defines W == 32
+#include <static_bitsequence.h>
+#include <alphabet_mapper.h>
+#include <static_sequence.h>
+
+// Re-define word size to ulong:
+#undef W
+#if __WORDSIZE == 64
+#   define W 64
+#else
+#   define W 32
+#endif
+#undef bitset
+
+// Un-comment to compare BWT against a BWT generated from class dynFMI:
+//#define CSA_TEST_BWT
  
  /**
   * Implementation of the TextCollection interface
   *
- * Implementation notes:
- *
- * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
- * which requires O(n log \sigma) bits of memory. If we know the distribution
- * of characters beforehand, space can easily be made O(nH_0).
- * 
- * The method MakeStatic() constructs a Succinct Suffix Array with a huffman 
- * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
- *
- * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
- * with any FM-index variant supporting LF-mapping.
   */
  class CSA : public SXSI::TextCollection {
  public:
@@ -58,7 +65,12 @@ public:
      void MakeStatic();
      bool EmptyText(DocId) const;
      uchar* GetText(DocId) const;
-    uchar* GetText(DocId, TextPosition, TextPosition) const;
+    /**
+     * Next method is not supported:
+     * Supporting GetText for some substring [i,j]
+     * would require more space.
+     */
+//    uchar* GetText(DocId, TextPosition, TextPosition) const;
  
      bool IsPrefix(uchar const *) const;
      bool IsSuffix(uchar const *) const;
@@ -66,6 +78,7 @@ public:
      bool IsContains(uchar const *) const;
      bool IsLessThan(uchar const *) const;
  
+    ulong Count(uchar const *) const;
      unsigned CountPrefix(uchar const *) const;
      unsigned CountSuffix(uchar const *) const;
      unsigned CountEqual(uchar const *) const;
@@ -86,10 +99,32 @@ public:
      void Load(FILE *, unsigned);
      void Save(FILE *) const;
  
-    // debug
-    TextPosition Lookup(TextPosition) const;
+
+    // FIXME Remove
+    void deleteWT()
+    {
+        delete alphabetrank;
+        alphabetrank = 0;
+        delete [] codetable;
+        codetable = 0;
+    }
+    void deleteSamples()
+    {
+        delete sampled;
+        sampled =0;
+        delete suffixes;
+        suffixes = 0;
+        delete suffixDocId;
+        suffixDocId = 0;
+    }
+    void deleteEndmarker()
+    {
+        delete endmarkerDocId;
+        endmarkerDocId = 0;
+    }
  
  private:
+    // FIXME Unused code
      class TCodeEntry {
      public:
          unsigned count;
@@ -99,6 +134,7 @@ private:
      };   
       
  
+    // FIXME Unused code
      class THuffAlphabetRank {
      // using fixed 0...255 alphabet
      private:
@@ -110,7 +146,10 @@ private:
          bool leaf;
      public:
          THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
+        THuffAlphabetRank(FILE *);
          ~THuffAlphabetRank();
+        
+        void Save(FILE *);
          bool Test(uchar *, TextPosition);
          
          inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
@@ -151,7 +190,7 @@ private:
              } 
              return true;
          }
-        inline int charAtPos(TextPosition i) const {
+        inline uchar access(TextPosition i) const {
              THuffAlphabetRank const * temp=this;
              while (!temp->leaf) {
                  if (temp->bitrank->IsBitSet(i)) {
@@ -163,10 +202,26 @@ private:
                      temp = temp->left;      
              }         
              }
-            return (int)temp->ch;
+            return temp->ch;
+        }
+
+        inline uchar charAtPos(ulong i, TextPosition *rank) const {
+            THuffAlphabetRank const * temp=this;
+            while (!temp->leaf) {
+                if (temp->bitrank->IsBitSet(i)) {
+                    i = temp->bitrank->rank(i)-1;
+                    temp = temp->right;
+                } else {
+                    i = i-temp->bitrank->rank(i);
+                    temp = temp->left;
+                }
+            }
+            (*rank)=i;
+            return temp->ch;
          }
      };
  
+    // FIXME Unused code
      class node {
      private:
          unsigned weight;
@@ -200,48 +255,70 @@ private:
          static TCodeEntry *makecodetable(uchar *, TextPosition);
      };
      
-    static const unsigned char print = 1;
-    static const unsigned char report = 1;
+    static const uchar versionFlag;
      TextPosition n;
      unsigned samplerate;
      unsigned C[256];
      TextPosition bwtEndPos;
-    THuffAlphabetRank *alphabetrank;
-    BitRank *sampled; 
-    BlockArray *suffixes;
-    BlockArray *positions;
-    TCodeEntry *codetable;
+//    THuffAlphabetRank *alphabetrank;
+    //    RLWaveletTree *alphabetrank;
+    static_sequence * alphabetrank;
+
+#ifdef CSA_TEST_BWT
      DynFMI *dynFMI;
+#endif
+    TCodeEntry *codetable;
+    
+    // Sample structures for texts longer than samplerate
+    BSGAP *sampled; 
+    BlockArray *suffixes;
+    BlockArray *suffixDocId;
  
      // Total number of texts in the collection
      unsigned numberOfTexts;
+    // Total number of texts including empty texts
+    unsigned numberOfAllTexts;
      // Length of the longest text
      ulong maxTextLength;
-    
-    
+
      // Array of document id's in the order of end-markers in BWT
      // Access by endmarkerDocId[rank_$(L, p) - 1].
      BlockArray *endmarkerDocId;
-    // Array of text lengths (in the inserted order)
-    BlockArray *textLength;
-    // Array of text starting positions (in the inserted order)
-    BlockArray *textStartPos;
- 
-    // Private methods
+
+    // FIXME Replace with a more succinct data structure
+    // Note: Empty texts are already handled inside XMLTree class.
+    std::set<unsigned> emptyTextId;
+    BSGAP *emptyTextRank;
+
+    // FIXME A better solution?
+    std::string texts;
+
+    // Following are not part of the public API
      uchar * BWT(uchar *);
-    uchar * LoadFromFile(const char *);
-    void SaveToFile(const char *, uchar *);
      void makewavelet(uchar *);
      void maketables();
-
-    // Following are not part of the public API
-    DocId DocIdAtTextPos(TextPosition) const;
-    unsigned CountEndmarkers(TextPosition, TextPosition) const;
+    DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
      ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-    TextPosition Inverse(TextPosition) const;
-    TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
-    TextPosition Psi(TextPosition) const;
-    uchar * Substring(TextPosition, TextPosition) const;
+//    TextPosition Inverse(TextPosition) const;
+//    TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
+//    TextPosition Psi(TextPosition) const;
+//    uchar * Substring(TextPosition, TextPosition) const;
+//    TextPosition Lookup(TextPosition) const;
+
+    /**
+     * Count end-markers in given interval
+     */
+    inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
+    {
+        if (sp > ep)
+            return 0;
+
+        ulong ranksp = 0;
+        if (sp != 0)
+            ranksp = alphabetrank->rank(0, sp - 1);
+    
+        return alphabetrank->rank(0, ep) - ranksp;
+    }
  };
  
  #endif