#include "BitRank.h"
#include "TextCollection.h"
#include "BlockArray.h"
-#include "BSGAP.h"
// Include from XMLTree/libcds
#include <basics.h> // Defines W == 32
#include <static_bitsequence.h>
#include <alphabet_mapper.h>
#include <static_sequence.h>
+#include <static_sequence_wvtree_noptrs.h>
// Re-define word size to ulong:
#undef W
#endif
#undef bitset
+#include "TextStorage.h"
+
+
namespace SXSI
{
-// Un-comment to compare BWT against a BWT generated from class dynFMI:
-//#define CSA_TEST_BWT
/**
* Implementation of the TextCollection interface
~TCImplementation();
bool EmptyText(DocId) const;
- uchar* GetText(DocId) const;
+
+ /**
+ * Returns a pointer to the original text.
+ *
+ * Do *not* try to free the array.
+ * (However, this implementation is suspect to change.)
+ *
+ * See TextStorage.h for details.
+ */
+ uchar * GetText(DocId) const;
+
+ /**
+ * Returns a pointer to the beginning of texts i, i+1, ..., j.
+ * Texts are separated by a '\0' byte.
+ *
+ * Do *not* try to free the array.
+ * (However, this implementation is suspect to change.)
+ *
+ * See TextStorage.h for details.
+ */
+ uchar * GetText(DocId i, DocId j) const;
+
/**
- * Next method is not supported:
- * Supporting GetText for some substring [i,j]
- * would require more space.
+ * Returns a substring of given text ID.
+ *
+ * FIXME This may be reimplemented via TextStorage.
*/
// uchar* GetText(DocId, TextPosition, TextPosition) const;
static_sequence * alphabetrank;
// Sample structures for texts longer than samplerate
- BSGAP *sampled; // FIXME Replace with RRR02
+ static_bitsequence * sampled;
BlockArray *suffixes;
BlockArray *suffixDocId;
// Total number of texts in the collection
unsigned numberOfTexts;
- // Total number of texts including empty texts
- unsigned numberOfAllTexts;
// Length of the longest text
ulong maxTextLength;
// Array of document id's in the order of end-markers in BWT
- // Access by endmarkerDocId[rank_$(L, p) - 1].
- BlockArray *endmarkerDocId;
+ static_sequence *Doc;
- // FIXME Replace with a more succinct data structure
- // Note: Empty texts are already handled inside XMLTree class.
- BSGAP *emptyTextRank; // FIXME Remove
+ // Text storage for fast extraction
+ TextStorage * textStorage;
- // Following are not part of the public API
+ // Following methods are not part of the public API
uchar * BWT(uchar *);
void makewavelet(uchar *);
void maketables();
return alphabetrank->rank(0, ep) - ranksp;
}
- unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const;
+
+ /**
+ * Count end-markers in given interval and
+ * within docIds [min,max]
+ */
+ inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
+ {
+ if (sp != 0)
+ sp = alphabetrank->rank(0, sp - 1);
+ ep = alphabetrank->rank(0, ep);
+ if (ep == 0)
+ return 0;
+
+ return Doc->count(sp, ep-1, min, max);
+ }
+
+ /**
+ * Enumerate all end-markers in given interval
+ */
+ inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
+ {
+ if (sp != 0)
+ sp = alphabetrank->rank(0, sp - 1);
+ ep = alphabetrank->rank(0, ep);
+ if (ep == 0)
+ return document_result();
+
+ return Doc->accessAll(sp, ep-1);
+ }
+
+ /**
+ * Enumerate end-markers in given interval and
+ * within docIds [min,max]
+ */
+ inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
+ {
+ if (sp != 0)
+ sp = alphabetrank->rank(0, sp - 1);
+ ep = alphabetrank->rank(0, ep);
+ if (ep == 0)
+ return document_result();
+
+ return Doc->access(sp, ep-1, min, max);
+ }
}; // class TCImplementation
} // namespace SXSI