X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.h;h=be9a2f9246f120bcae166441e01d068fe65df34b;hb=1ea86bc7fdc05347b46919374f071920fec8adb5;hp=ec6e890be2bffd93e39452af1bd4d33013ed46e0;hpb=9df50e6e5e58f746e83b9ad1d539024f82fa8b64;p=SXSI%2FTextCollection.git

diff --git a/CSA.h b/CSA.h
index ec6e890..be9a2f9 100644
--- a/CSA.h
+++ b/CSA.h
@@ -25,11 +25,12 @@
 #include "TextCollection.h"
 #include "BlockArray.h"
 #include "RLWaveletTree.h"
+#include "StringIterator.h"
 #include <set>
 #include <vector>
 
 // Include  from XMLTree/libcds
-#include <basics.h>
+#include <basics.h> // Defines W == 32
 #include <static_bitsequence.h>
 #include <alphabet_mapper.h>
 #include <static_sequence.h>
@@ -41,22 +42,17 @@
 #else
 #   define W 32
 #endif
+#undef bitset
 
+namespace SXSI 
+{
+
+// Un-comment to compare BWT against a BWT generated from class dynFMI:
+//#define CSA_TEST_BWT
 
 /**
  * Implementation of the TextCollection interface
  *
- * Implementation notes:
- *
- * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
- * which requires O(n log \sigma) bits of memory. If we know the distribution
- * of characters beforehand, space can easily be made O(nH_0).
- * 
- * The method MakeStatic() constructs a Succinct Suffix Array with a huffman 
- * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
- *
- * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
- * with any FM-index variant supporting LF-mapping.
  */
 class CSA : public SXSI::TextCollection {
 public:
@@ -73,6 +69,11 @@ public:
     void MakeStatic();
     bool EmptyText(DocId) const;
     uchar* GetText(DocId) const;
+    /**
+     * Next method is not supported:
+     * Supporting GetText for some substring [i,j]
+     * would require more space.
+     */
 //    uchar* GetText(DocId, TextPosition, TextPosition) const;
 
     bool IsPrefix(uchar const *) const;
@@ -81,6 +82,7 @@ public:
     bool IsContains(uchar const *) const;
     bool IsLessThan(uchar const *) const;
 
+    ulong Count(uchar const *) const;
     unsigned CountPrefix(uchar const *) const;
     unsigned CountSuffix(uchar const *) const;
     unsigned CountEqual(uchar const *) const;
@@ -102,7 +104,7 @@ public:
     void Save(FILE *) const;
 
 
-    // Debug FIXME Remove
+    // FIXME Remove
     void deleteWT()
     {
         delete alphabetrank;
@@ -116,8 +118,6 @@ public:
         sampled =0;
         delete suffixes;
         suffixes = 0;
-        delete positions;
-        positions = 0;
         delete suffixDocId;
         suffixDocId = 0;
     }
@@ -128,6 +128,7 @@ public:
     }
 
 private:
+    // FIXME Unused code
     class TCodeEntry {
     public:
         unsigned count;
@@ -137,6 +138,7 @@ private:
     };   
      
 
+    // FIXME Unused code
     class THuffAlphabetRank {
     // using fixed 0...255 alphabet
     private:
@@ -223,6 +225,7 @@ private:
         }
     };
 
+    // FIXME Unused code
     class node {
     private:
         unsigned weight;
@@ -256,8 +259,6 @@ private:
         static TCodeEntry *makecodetable(uchar *, TextPosition);
     };
     
-    static const unsigned char print = 1;
-    static const unsigned char report = 1;
     static const uchar versionFlag;
     TextPosition n;
     unsigned samplerate;
@@ -266,12 +267,16 @@ private:
 //    THuffAlphabetRank *alphabetrank;
     //    RLWaveletTree *alphabetrank;
     static_sequence * alphabetrank;
+
+#ifdef CSA_TEST_BWT
+    DynFMI *dynFMI;
+#endif
+    TCodeEntry *codetable;
+    
+    // Sample structures for texts longer than samplerate
     BSGAP *sampled; 
     BlockArray *suffixes;
     BlockArray *suffixDocId;
-    BlockArray *positions;
-    TCodeEntry *codetable;
-    DynFMI *dynFMI;
 
     // Total number of texts in the collection
     unsigned numberOfTexts;
@@ -283,30 +288,27 @@ private:
     // Array of document id's in the order of end-markers in BWT
     // Access by endmarkerDocId[rank_$(L, p) - 1].
     BlockArray *endmarkerDocId;
-    // Array of text lengths (in the inserted order)
-    BlockArray *textLength;
-    // Array of text starting positions (in the inserted order)
-    BlockArray *textStartPos;
 
     // FIXME Replace with a more succinct data structure
+    // Note: Empty texts are already handled inside XMLTree class.
     std::set<unsigned> emptyTextId;
     BSGAP *emptyTextRank;
 
-    // Private methods
+    // FIXME A better solution?
+    std::string texts;
+
+    // Following are not part of the public API
     uchar * BWT(uchar *);
-    uchar * LoadFromFile(const char *);
-    void SaveToFile(const char *, uchar *);
     void makewavelet(uchar *);
     void maketables();
-
-    // Following are not part of the public API
-    DocId DocIdAtTextPos(TextPosition) const;
+    DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-    TextPosition Inverse(TextPosition) const;
-    TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
-    TextPosition Psi(TextPosition) const;
-    uchar * Substring(TextPosition, TextPosition) const;
-    TextPosition Lookup(TextPosition) const;
+    ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
+//    TextPosition Inverse(TextPosition) const;
+//    TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
+//    TextPosition Psi(TextPosition) const;
+//    uchar * Substring(TextPosition, TextPosition) const;
+//    TextPosition Lookup(TextPosition) const;
 
     /**
      * Count end-markers in given interval
@@ -322,6 +324,8 @@ private:
     
         return alphabetrank->rank(0, ep) - ranksp;
     }
-};
+}; // class CSA
+
+} // namespace SXSI
 
 #endif