#include <vector>
// Include from XMLTree/libcds
-#include <basics.h>
+#include <basics.h> // Defines W == 32
#include <static_bitsequence.h>
#include <alphabet_mapper.h>
#include <static_sequence.h>
#endif
#undef bitset
+// Un-comment to compare BWT against a BWT generated from class dynFMI:
+//#define CSA_TEST_BWT
/**
* Implementation of the TextCollection interface
*
- * Implementation notes:
- *
- * Dynamic construction (via InsertText()) uses dynamic (balanced) wavelet tree
- * which requires O(n log \sigma) bits of memory. If we know the distribution
- * of characters beforehand, space can easily be made O(nH_0).
- *
- * The method MakeStatic() constructs a Succinct Suffix Array with a huffman
- * shaped wavelet tree. This structure achieves only zero'th order entropy. (FIXME)
- *
- * Query methods Prefix, Suffix, Equal, Contains and LessThan can be used
- * with any FM-index variant supporting LF-mapping.
*/
class CSA : public SXSI::TextCollection {
public:
void Save(FILE *) const;
- // Debug FIXME Remove
+ // FIXME Remove
void deleteWT()
{
delete alphabetrank;
sampled =0;
delete suffixes;
suffixes = 0;
- delete positions;
- positions = 0;
delete suffixDocId;
suffixDocId = 0;
}
static TCodeEntry *makecodetable(uchar *, TextPosition);
};
- // FIXME Unused code
- static const unsigned char print = 1;
- static const unsigned char report = 1;
static const uchar versionFlag;
TextPosition n;
unsigned samplerate;
// THuffAlphabetRank *alphabetrank;
// RLWaveletTree *alphabetrank;
static_sequence * alphabetrank;
+
+#ifdef CSA_TEST_BWT
+ DynFMI *dynFMI;
+#endif
+ TCodeEntry *codetable;
+
+ // Sample structures for texts longer than samplerate
BSGAP *sampled;
BlockArray *suffixes;
BlockArray *suffixDocId;
- BlockArray *positions;
- TCodeEntry *codetable;
- DynFMI *dynFMI;
// Total number of texts in the collection
unsigned numberOfTexts;
// Array of document id's in the order of end-markers in BWT
// Access by endmarkerDocId[rank_$(L, p) - 1].
BlockArray *endmarkerDocId;
- // Array of text lengths (in the inserted order)
- BlockArray *textLength;
- // Array of text starting positions (in the inserted order)
- BlockArray *textStartPos;
// FIXME Replace with a more succinct data structure
+ // Note: Empty texts are already handled inside XMLTree class.
std::set<unsigned> emptyTextId;
BSGAP *emptyTextRank;
- // Private methods
+ // FIXME A better solution?
+ std::string texts;
+
+ // Following are not part of the public API
uchar * BWT(uchar *);
- uchar * LoadFromFile(const char *);
- void SaveToFile(const char *, uchar *);
void makewavelet(uchar *);
void maketables();
-
- // Following are not part of the public API
- DocId DocIdAtTextPos(TextPosition) const;
+ DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
- TextPosition Inverse(TextPosition) const;
- TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
- TextPosition Psi(TextPosition) const;
- uchar * Substring(TextPosition, TextPosition) const;
- TextPosition Lookup(TextPosition) const;
+// TextPosition Inverse(TextPosition) const;
+// TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const;
+// TextPosition Psi(TextPosition) const;
+// uchar * Substring(TextPosition, TextPosition) const;
+// TextPosition Lookup(TextPosition) const;
/**
* Count end-markers in given interval