#ifndef _CSA_H_
#define _CSA_H_
+#include "dynFMI.h"
+
+// Include from XMLTree/libcds
+#include <basics.h>
+#include <static_bitsequence.h>
+#include <alphabet_mapper.h>
+#include <static_sequence.h>
-#include <map>
+#include <set>
#include <vector>
#include "BitRank.h"
-#include "dynFMI.h"
#include "TextCollection.h"
+#include "BlockArray.h"
+#include "RLWaveletTree.h"
/**
* Implementation of the TextCollection interface
void MakeStatic();
bool EmptyText(DocId) const;
uchar* GetText(DocId) const;
- uchar* GetText(DocId, TextPosition, TextPosition) const;
+// uchar* GetText(DocId, TextPosition, TextPosition) const;
bool IsPrefix(uchar const *) const;
bool IsSuffix(uchar const *) const;
void Load(FILE *, unsigned);
void Save(FILE *) const;
+
+ // Debug FIXME Remove
+ void deleteWT()
+ {
+ delete alphabetrank;
+ alphabetrank = 0;
+ delete [] codetable;
+ codetable = 0;
+ }
+ void deleteSamples()
+ {
+ delete sampled;
+ sampled =0;
+ delete suffixes;
+ suffixes = 0;
+ delete positions;
+ positions = 0;
+ delete suffixDocId;
+ suffixDocId = 0;
+ }
+ void deleteEndmarker()
+ {
+ delete endmarkerDocId;
+ endmarkerDocId = 0;
+ }
+
private:
class TCodeEntry {
public:
bool leaf;
public:
THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned);
+ THuffAlphabetRank(FILE *);
~THuffAlphabetRank();
+
+ void Save(FILE *);
bool Test(uchar *, TextPosition);
- inline ulong rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
+ inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i
THuffAlphabetRank const * temp=this;
if (codetable[c].count == 0) return 0;
unsigned level = 0;
}
return true;
}
- inline int charAtPos(TextPosition i) const {
+ inline uchar access(TextPosition i) const {
THuffAlphabetRank const * temp=this;
while (!temp->leaf) {
if (temp->bitrank->IsBitSet(i)) {
temp = temp->left;
}
}
- return (int)temp->ch;
+ return temp->ch;
+ }
+
+ inline uchar charAtPos(ulong i, TextPosition *rank) const {
+ THuffAlphabetRank const * temp=this;
+ while (!temp->leaf) {
+ if (temp->bitrank->IsBitSet(i)) {
+ i = temp->bitrank->rank(i)-1;
+ temp = temp->right;
+ } else {
+ i = i-temp->bitrank->rank(i);
+ temp = temp->left;
+ }
+ }
+ (*rank)=i;
+ return temp->ch;
}
};
static const unsigned char print = 1;
static const unsigned char report = 1;
+ static const uchar versionFlag;
TextPosition n;
unsigned samplerate;
- ulong C[256];
+ unsigned C[256];
TextPosition bwtEndPos;
- THuffAlphabetRank *alphabetrank;
- BitRank *sampled;
- ulong *suffixes;
- ulong *positions;
+// THuffAlphabetRank *alphabetrank;
+ // RLWaveletTree *alphabetrank;
+ static_sequence * alphabetrank;
+ BSGAP *sampled;
+ BlockArray *suffixes;
+ BlockArray *suffixDocId;
+ BlockArray *positions;
TCodeEntry *codetable;
DynFMI *dynFMI;
- // Map from end-marker in BWT to pair (textId, sampled text position)
- std::map<TextPosition, std::pair<DocId, TextPosition> > endmarkers;
- // Vector of pairs of <text length, text start position>
- std::vector<std::pair<TextPosition, TextPosition> > textLength;
-
+
+ // Total number of texts in the collection
+ unsigned numberOfTexts;
+ // Total number of texts including empty texts
+ unsigned numberOfAllTexts;
+ // Length of the longest text
+ ulong maxTextLength;
+
+ // Array of document id's in the order of end-markers in BWT
+ // Access by endmarkerDocId[rank_$(L, p) - 1].
+ BlockArray *endmarkerDocId;
+ // Array of text lengths (in the inserted order)
+ BlockArray *textLength;
+ // Array of text starting positions (in the inserted order)
+ BlockArray *textStartPos;
+
+ // FIXME Replace with a more succinct data structure
+ std::set<unsigned> emptyTextId;
+ BSGAP *emptyTextRank;
+
// Private methods
uchar * BWT(uchar *);
uchar * LoadFromFile(const char *);
// Following are not part of the public API
DocId DocIdAtTextPos(TextPosition) const;
ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
- TextPosition Lookup(TextPosition) const;
TextPosition Inverse(TextPosition) const;
TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
TextPosition Psi(TextPosition) const;
uchar * Substring(TextPosition, TextPosition) const;
+ TextPosition Lookup(TextPosition) const;
+
+ /**
+ * Count end-markers in given interval
+ */
+ inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
+ {
+ if (sp > ep)
+ return 0;
+
+ ulong ranksp = 0;
+ if (sp != 0)
+ ranksp = alphabetrank->rank(0, sp - 1);
+
+ return alphabetrank->rank(0, ep) - ranksp;
+ }
};
#endif