BWT via Difference cover

author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Fri, 6 Mar 2009 13:39:30 +0000 (13:39 +0000)

committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>

Fri, 6 Mar 2009 13:39:30 +0000 (13:39 +0000)
author nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Fri, 6 Mar 2009 13:39:30 +0000 (13:39 +0000)
committer nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
Fri, 6 Mar 2009 13:39:30 +0000 (13:39 +0000)
diff --git a/CSA.h b/CSA.h

index df56585..c0751cf 100644 (file)
--- a/CSA.h
+++ b/CSA.h
@@ -29,7 +29,7 @@
  #include <vector>
  
  // Include  from XMLTree/libcds
-#include <basics.h>
+#include <basics.h> // Defines W == 32
  #include <static_bitsequence.h>
  #include <alphabet_mapper.h>
  #include <static_sequence.h>
@@ -43,21 +43,12 @@
  #endif
  #undef bitset
  
+// Un-comment to compare BWT against a BWT generated from class dynFMI:
+//#define CSA_TEST_BWT
  
  /**
   * Implementation of the TextCollection interface
   *
- * Implementation notes:
- *
- * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
- * which requires O(n log \sigma) bits of memory. If we know the distribution
- * of characters beforehand, space can easily be made O(nH_0).
- * 
- * The method MakeStatic() constructs a Succinct Suffix Array with a huffman 
- * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
- *
- * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
- * with any FM-index variant supporting LF-mapping.
   */
  class CSA : public SXSI::TextCollection {
  public:
@@ -109,7 +100,7 @@ public:
      void Save(FILE *) const;
  
  
-    // Debug FIXME Remove
+    // FIXME Remove
      void deleteWT()
      {
          delete alphabetrank;
@@ -123,8 +114,6 @@ public:
          sampled =0;
          delete suffixes;
          suffixes = 0;
-        delete positions;
-        positions = 0;
          delete suffixDocId;
          suffixDocId = 0;
      }
@@ -266,9 +255,6 @@ private:
          static TCodeEntry *makecodetable(uchar *, TextPosition);
      };
      
-    // FIXME Unused code
-    static const unsigned char print = 1;
-    static const unsigned char report = 1;
      static const uchar versionFlag;
      TextPosition n;
      unsigned samplerate;
@@ -277,12 +263,16 @@ private:
  //    THuffAlphabetRank *alphabetrank;
      //    RLWaveletTree *alphabetrank;
      static_sequence * alphabetrank;
+
+#ifdef CSA_TEST_BWT
+    DynFMI *dynFMI;
+#endif
+    TCodeEntry *codetable;
+    
+    // Sample structures for texts longer than samplerate
      BSGAP *sampled; 
      BlockArray *suffixes;
      BlockArray *suffixDocId;
-    BlockArray *positions;
-    TCodeEntry *codetable;
-    DynFMI *dynFMI;
  
      // Total number of texts in the collection
      unsigned numberOfTexts;
@@ -294,30 +284,26 @@ private:
      // Array of document id's in the order of end-markers in BWT
      // Access by endmarkerDocId[rank_$(L, p) - 1].
      BlockArray *endmarkerDocId;
-    // Array of text lengths (in the inserted order)
-    BlockArray *textLength;
-    // Array of text starting positions (in the inserted order)
-    BlockArray *textStartPos;
  
      // FIXME Replace with a more succinct data structure
+    // Note: Empty texts are already handled inside XMLTree class.
      std::set<unsigned> emptyTextId;
      BSGAP *emptyTextRank;
  
-    // Private methods
+    // FIXME A better solution?
+    std::string texts;
+
+    // Following are not part of the public API
      uchar * BWT(uchar *);
-    uchar * LoadFromFile(const char *);
-    void SaveToFile(const char *, uchar *);
      void makewavelet(uchar *);
      void maketables();
-
-    // Following are not part of the public API
-    DocId DocIdAtTextPos(TextPosition) const;
+    DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
      ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-    TextPosition Inverse(TextPosition) const;
-    TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
-    TextPosition Psi(TextPosition) const;
-    uchar * Substring(TextPosition, TextPosition) const;
-    TextPosition Lookup(TextPosition) const;
+//    TextPosition Inverse(TextPosition) const;
+//    TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
+//    TextPosition Psi(TextPosition) const;
+//    uchar * Substring(TextPosition, TextPosition) const;
+//    TextPosition Lookup(TextPosition) const;
  
      /**
       * Count end-markers in given interval
author	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Fri, 6 Mar 2009 13:39:30 +0000 (13:39 +0000)
committer	nvalimak <nvalimak@3cdefd35-fc62-479d-8e8d-bae585ffb9ca>
	Fri, 6 Mar 2009 13:39:30 +0000 (13:39 +0000)