X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.h;h=df56585064171bbc4336b81e30aa34c4439a17bc;hb=663cd2f6cc5e3796d001e8c527de0aea8c8bbf68;hp=00cf1c69f1a85c39f7f6e1be471303f3d198f5e1;hpb=56d77631b00f8622c211db6721cb087a5d29f0e2;p=SXSI%2FTextCollection.git

diff --git a/CSA.h b/CSA.h
index 00cf1c6..df56585 100644
--- a/CSA.h
+++ b/CSA.h
@@ -20,12 +20,29 @@
 
 #ifndef _CSA_H_
 #define _CSA_H_
-
-#include <map>
-#include <vector>
-#include "BitRank.h"
 #include "dynFMI.h"
+#include "BitRank.h"
 #include "TextCollection.h"
+#include "BlockArray.h"
+#include "RLWaveletTree.h"
+#include <set>
+#include <vector>
+
+// Includes from XMLTree/libcds
+#include <basics.h>
+#include <static_bitsequence.h>
+#include <alphabet_mapper.h>
+#include <static_sequence.h>
+
+// Re-define the word size macro W to match ulong:
+#undef W
+#if __WORDSIZE == 64
+#   define W 64
+#else
+#   define W 32
+#endif
+#undef bitset
+
 
 /**
  * Implementation of the TextCollection interface
@@ -57,7 +74,12 @@ public:
     void MakeStatic();
     bool EmptyText(DocId) const;
     uchar* GetText(DocId) const;
-    uchar* GetText(DocId, TextPosition, TextPosition) const;
+    /**
+     * The following method is not supported:
+     * supporting GetText for an arbitrary substring [i,j]
+     * would require more space.
+     */
+//    uchar* GetText(DocId, TextPosition, TextPosition) const;
 
     bool IsPrefix(uchar const *) const;
     bool IsSuffix(uchar const *) const;
@@ -65,6 +87,7 @@ public:
     bool IsContains(uchar const *) const;
     bool IsLessThan(uchar const *) const;
 
+    ulong Count(uchar const *) const;
     unsigned CountPrefix(uchar const *) const;
     unsigned CountSuffix(uchar const *) const;
     unsigned CountEqual(uchar const *) const;
@@ -85,7 +108,34 @@ public:
     void Load(FILE *, unsigned);
     void Save(FILE *) const;
 
+
+    // Debug only. FIXME: remove.
+    void deleteWT() // Frees alphabetrank and codetable, then nulls both pointers.
+    {
+        delete alphabetrank;
+        alphabetrank = 0; // reset so the member no longer points at freed memory
+        delete [] codetable; // released with the array form of delete
+        codetable = 0;
+    }
+    void deleteSamples() // Frees sampled, suffixes, positions and suffixDocId, nulling each pointer afterwards.
+    {
+        delete sampled;
+        sampled =0;
+        delete suffixes;
+        suffixes = 0;
+        delete positions;
+        positions = 0;
+        delete suffixDocId;
+        suffixDocId = 0; // every member is reset to 0 right after its delete
+    }
+    void deleteEndmarker() // Frees endmarkerDocId and nulls the pointer.
+    {
+        delete endmarkerDocId;
+        endmarkerDocId = 0; // reset so the member no longer points at freed memory
+    }
+
 private:
+    // FIXME Unused code
     class TCodeEntry {
     public:
         unsigned count;
@@ -95,6 +145,7 @@ private:
     };   
      
 
+    // FIXME Unused code
     class THuffAlphabetRank {
     // using fixed 0...255 alphabet
     private:
@@ -106,10 +157,13 @@ private:
         bool leaf;
     public:
         THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
+        THuffAlphabetRank(FILE *);
         ~THuffAlphabetRank();
+        
+        void Save(FILE *);
         bool Test(uchar *, TextPosition);
         
-        inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
+        inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
             THuffAlphabetRank const * temp=this;
             if (codetable[c].count == 0) return 0;
             unsigned level = 0;
@@ -147,7 +201,7 @@ private:
             } 
             return true;
         }
-        inline int charAtPos(TextPosition i) const {
+        inline uchar access(TextPosition i) const {
             THuffAlphabetRank const * temp=this;
             while (!temp->leaf) {
                 if (temp->bitrank->IsBitSet(i)) {
@@ -159,10 +213,26 @@ private:
                     temp = temp->left;      
             }         
             }
-            return (int)temp->ch;
+            return temp->ch;
+        }
+
+        inline uchar charAtPos(ulong i, TextPosition *rank) const { // Descends from this node to a leaf; returns the leaf's ch and writes the final index to *rank.
+            THuffAlphabetRank const * temp=this;
+            while (!temp->leaf) { // loop ends once a leaf node is reached
+                if (temp->bitrank->IsBitSet(i)) { // bit set at i: continue in the right child
+                    i = temp->bitrank->rank(i)-1; // remap i for the child: bitrank rank of i, minus one
+                    temp = temp->right;
+                } else { // bit clear at i: continue in the left child
+                    i = i-temp->bitrank->rank(i); // remap i for the child: i minus bitrank rank of i
+                    temp = temp->left;
+                }
+            }
+            (*rank)=i; // rank is dereferenced unconditionally, so it must be non-null; NOTE(review): presumably the 0-based rank of the returned character - confirm
+            return temp->ch;
         }
     };
 
+    // FIXME Unused code
     class node {
     private:
         unsigned weight;
@@ -196,23 +266,43 @@ private:
         static TCodeEntry *makecodetable(uchar *, TextPosition);
     };
     
+    // FIXME Unused code
     static const unsigned char print = 1;
     static const unsigned char report = 1;
+    static const uchar versionFlag;
     TextPosition n;
     unsigned samplerate;
-    ulong C[256];
+    unsigned C[256];
     TextPosition bwtEndPos;
-    THuffAlphabetRank *alphabetrank;
-    BitRank *sampled; 
-    ulong *suffixes;
-    ulong *positions;
+//    THuffAlphabetRank *alphabetrank;
+    //    RLWaveletTree *alphabetrank;
+    static_sequence * alphabetrank;
+    BSGAP *sampled; 
+    BlockArray *suffixes;
+    BlockArray *suffixDocId;
+    BlockArray *positions;
     TCodeEntry *codetable;
     DynFMI *dynFMI;
-    // Map from end-marker in BWT to pair (textId, sampled text position)
-    std::map<TextPosition, std::pair<DocId, TextPosition> > endmarkers;
-    // Vector of pairs of <text length, text start position>
-    std::vector<std::pair<TextPosition, TextPosition> > textLength;
-    
+
+    // Total number of texts in the collection
+    unsigned numberOfTexts;
+    // Total number of texts including empty texts
+    unsigned numberOfAllTexts;
+    // Length of the longest text
+    ulong maxTextLength;
+
+    // Array of document id's in the order of end-markers in BWT
+    // Access by endmarkerDocId[rank_$(L, p) - 1].
+    BlockArray *endmarkerDocId;
+    // Array of text lengths (in the inserted order)
+    BlockArray *textLength;
+    // Array of text starting positions (in the inserted order)
+    BlockArray *textStartPos;
+
+    // FIXME Replace with a more succinct data structure
+    std::set<unsigned> emptyTextId;
+    BSGAP *emptyTextRank;
+
     // Private methods
     uchar * BWT(uchar *);
     uchar * LoadFromFile(const char *);
@@ -223,11 +313,26 @@ private:
     // Following are not part of the public API
     DocId DocIdAtTextPos(TextPosition) const;
     ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-    TextPosition Lookup(TextPosition) const;
     TextPosition Inverse(TextPosition) const;
     TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
     TextPosition Psi(TextPosition) const;
     uchar * Substring(TextPosition, TextPosition) const;
+    TextPosition Lookup(TextPosition) const;
+
+    /**
+     * Count end-markers (symbol 0) in the given interval [sp, ep]
+     */
+    inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const // Returns alphabetrank->rank(0, ep) minus alphabetrank->rank(0, sp - 1); 0 when sp > ep.
+    {
+        if (sp > ep) // empty interval
+            return 0;
+
+        ulong ranksp = 0; // stays 0 when sp == 0, i.e. nothing precedes the interval
+        if (sp != 0) // avoids evaluating sp - 1 for sp == 0 (would wrap if TextPosition is unsigned - TODO confirm)
+            ranksp = alphabetrank->rank(0, sp - 1);
+    
+        return alphabetrank->rank(0, ep) - ranksp; // result is converted to the unsigned return type
+    }
 };
 
 #endif