#include "BitRank.h"
#include "TextCollection.h"
#include "BlockArray.h"
-#include "BSGAP.h"
// Include from XMLTree/libcds
#include <basics.h> // Defines W == 32
#include <static_bitsequence.h>
#include <alphabet_mapper.h>
#include <static_sequence.h>
+#include <static_sequence_wvtree_noptrs.h>
// Re-define word size to ulong:
#undef W
#endif
#undef bitset
+#include "TextStorage.h"
+#include <set>
+
namespace SXSI
{
~TCImplementation();
bool EmptyText(DocId) const;
- uchar* GetText(DocId) const;
+
+ /**
+ * Returns a pointer to the original text.
+ *
+ * Do *not* try to free the array.
+ * (However, this implementation is suspect to change.)
+ *
+ * See TextStorage.h for details.
+ */
+ uchar * GetText(DocId) const;
+
/**
- * Next method is not supported:
- * Supporting GetText for some substring [i,j]
- * would require more space.
+ * Returns a pointer to the beginning of texts i, i+1, ..., j.
+ * Texts are separated by a '\0' byte.
+ *
+ * Do *not* try to free the array.
+ * (However, this implementation is suspect to change.)
+ *
+ * See TextStorage.h for details.
+ */
+ uchar * GetText(DocId i, DocId j) const;
+
+ /**
+ * Returns a substring of given text ID.
+ *
+ * FIXME This may be reimplemented via TextStorage.
*/
// uchar* GetText(DocId, TextPosition, TextPosition) const;
document_result Equal(uchar const *) const;
document_result Contains(uchar const *) const;
document_result LessThan(uchar const *) const;
+ document_result Kmismaches(uchar const *, unsigned) const;
+ document_result Kerrors(uchar const *, unsigned) const;
document_result Prefix(uchar const *, DocId, DocId) const;
document_result Suffix(uchar const *, DocId, DocId) const;
// Definition of full_result is inherited from SXSI::TextCollection.
full_result FullContains(uchar const *) const;
full_result FullContains(uchar const *, DocId, DocId) const;
+ full_result FullKmismatches(uchar const *, unsigned) const;
+ full_result FullKerrors(uchar const *, unsigned) const;
// Index from/to disk
TCImplementation(FILE *, unsigned);
void Save(FILE *) const;
private:
+ typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
+
static const uchar versionFlag;
TextPosition n;
unsigned samplerate;
static_sequence * alphabetrank;
// Sample structures for texts longer than samplerate
- BSGAP *sampled; // FIXME Replace with RRR02
+ static_bitsequence * sampled;
BlockArray *suffixes;
BlockArray *suffixDocId;
ulong maxTextLength;
// Array of document id's in the order of end-markers in BWT
- // Access by endmarkerDocId[rank_$(L, p) - 1].
static_sequence *Doc;
- // Following are not part of the public API
+ // Text storage for fast extraction
+ TextStorage * textStorage;
+
+ // Following methods are not part of the public API
uchar * BWT(uchar *);
void makewavelet(uchar *);
void maketables();
ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
-
+ ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
+ ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
+ ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
/**
* Count end-markers in given interval
*/
return Doc->access(sp, ep-1, min, max);
}
+
+ /**
+ * Enumerate documents in given interval [sp, ep]
+ */
+ inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
+ {
+ // We want unique document indentifiers, using std::set to collect them
+ // FIXME use unordered_set?
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ resultSet.insert(Doc->access(endmarkerRank));
+ }
+ else
+ {
+ DocId di = (*suffixDocId)[sampled->rank1(i)-1];
+ assert((unsigned)di < numberOfTexts);
+ resultSet.insert(di);
+ }
+ }
+ }
+
+ /**
+ * Enumerate documents in given interval [sp, ep]
+ * and within [begin, end]
+ */
+ inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+ {
+ // We want unique document indentifiers, using std::set to collect them
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ DocId docId = Doc->access(endmarkerRank);
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ else
+ {
+ DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+ assert((unsigned)docId < numberOfTexts);
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ }
+ }
+
+ /**
+ * Enumerate document+position pairs (full_result) of
+ * each suffix in given interval.
+ */
+ inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
+ {
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ DocId docId = Doc->access(endmarkerRank);
+ result.push_back(make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+
+ result.push_back(make_pair(docId, textPos));
+ }
+ }
+ }
+
+ /**
+ * Enumerate document+position pairs (full_result) of
+ * each suffix in given interval and within [begin, end].
+ */
+ inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+ {
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ DocId docId = Doc->access(endmarkerRank);
+ if (docId >= begin && docId <= end)
+ result.push_back(make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+
+ if (docId >= begin && docId <= end)
+ result.push_back(make_pair(docId, textPos));
+ }
+ }
+ }
+
}; // class TCImplementation
} // namespace SXSI