--- /dev/null
+/******************************************************************************
+ * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
+ * *
+ * FMIndex implementation for the TextCollection interface *
+ * *
+ * This program is free software; you can redistribute it and/or modify *
+ * it under the terms of the GNU Lesser General Public License as published *
+ * by the Free Software Foundation; either version 2 of the License, or *
+ * (at your option) any later version. *
+ * *
+ * This program is distributed in the hope that it will be useful, *
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of *
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
+ * GNU Lesser General Public License for more details. *
+ * *
+ * You should have received a copy of the GNU Lesser General Public License *
+ * along with this program; if not, write to the *
+ * Free Software Foundation, Inc., *
+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
+ *****************************************************************************/
+
+#ifndef _FMIndex_H_
+#define _FMIndex_H_
+
+#include "incbwt/bits/deltavector.h"
+
+#include "BitRank.h"
+#include "TextCollection.h"
+#include "BlockArray.h"
+
+// Include from XMLTree/libcds
+#include <basics.h> // Defines W == 32
+#include <static_bitsequence.h>
+#include <alphabet_mapper.h>
+#include <static_sequence.h>
+#include <static_sequence_wvtree_noptrs.h>
+
+// Re-define word size to ulong:
+#undef W
+#if __WORDSIZE == 64
+# define W 64
+#else
+# define W 32
+#endif
+#undef bitset
+#undef bitget
+
+
+#include "TextStorage.h"
+#include "ArrayDoc.h"
+#include <set>
+#include <string>
+
+namespace SXSI
+{
+
+
+/**
+ * Implementation of the TextCollection interface
+ *
+ */
+class FMIndex : public SXSI::TextCollection {
+public:
+ FMIndex(uchar *, ulong, unsigned, unsigned, ulong, ulong,
+ CSA::DeltaVector &, const std::string &, char);
+ ~FMIndex();
+
+ bool EmptyText(DocId) const;
+
+ /**
+ * Extracting one text.
+ *
+ * Call DeleteText() for each pointer returned by GetText()
+ * to avoid possible memory leaks.
+ */
+ uchar * GetText(DocId) const;
+ void DeleteText(uchar *text) const
+ { textStorage->DeleteText(text); }
+
+ /**
+ * Returns a pointer to the beginning of texts i, i+1, ..., j.
+ * Texts are separated by a '\0' byte.
+ *
+ * Call DeleteText() for each pointer returned by GetText()
+ * to avoid possible memory leaks.
+ */
+ uchar * GetText(DocId i, DocId j) const
+ { return textStorage->GetText(i, j); }
+
+ /**
+ * Returns a substring of given text ID.
+ *
+ * FIXME This may be reimplemented via TextStorage.
+ */
+// uchar* GetText(DocId, TextPosition, TextPosition) const;
+
+ bool IsPrefix(uchar const *) const;
+ bool IsSuffix(uchar const *) const;
+ bool IsEqual(uchar const *) const;
+ bool IsContains(uchar const *) const;
+ bool IsLessThan(uchar const *) const;
+
+ bool IsPrefix(uchar const *, DocId, DocId) const;
+ bool IsSuffix(uchar const *, DocId, DocId) const;
+ bool IsEqual(uchar const *, DocId, DocId) const;
+ bool IsContains(uchar const *, DocId, DocId) const;
+ bool IsLessThan(uchar const *, DocId, DocId) const;
+
+ ulong Count(uchar const *) const;
+ unsigned CountPrefix(uchar const *) const;
+ unsigned CountSuffix(uchar const *) const;
+ unsigned CountEqual(uchar const *) const;
+ unsigned CountContains(uchar const *) const;
+ unsigned CountLessThan(const unsigned char*) const;
+
+ unsigned CountPrefix(uchar const *, DocId, DocId) const;
+ unsigned CountSuffix(uchar const *, DocId, DocId) const;
+ unsigned CountEqual(uchar const *, DocId, DocId) const;
+ unsigned CountContains(uchar const *, DocId, DocId) const;
+ unsigned CountLessThan(uchar const *, DocId, DocId) const;
+
+ // Definition of document_result is inherited from SXSI::TextCollection.
+ document_result Prefix(uchar const *) const;
+ document_result Suffix(uchar const *) const;
+ document_result Equal(uchar const *) const;
+ document_result Contains(uchar const *) const;
+ document_result LessThan(uchar const *) const;
+ document_result KMismaches(uchar const *, unsigned) const;
+ document_result KErrors(uchar const *, unsigned) const;
+
+ document_result Prefix(uchar const *, DocId, DocId) const;
+ document_result Suffix(uchar const *, DocId, DocId) const;
+ document_result Equal(uchar const *, DocId, DocId) const;
+ document_result Contains(uchar const *, DocId, DocId) const;
+ document_result LessThan(uchar const *, DocId, DocId) const;
+
+ // Definition of full_result is inherited from SXSI::TextCollection.
+ full_result FullContains(uchar const *) const;
+ full_result FullContains(uchar const *, DocId, DocId) const;
+ full_result FullKMismatches(uchar const *, unsigned) const;
+ full_result FullKErrors(uchar const *, unsigned) const;
+
+ // Index from/to disk
+ FMIndex(FILE *, index_mode_t, unsigned);
+ void Save(FILE *, char const *) const;
+
+private:
+ typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
+
+ static const uchar versionFlag;
+ TextPosition n;
+ unsigned samplerate;
+ unsigned C[256];
+ TextPosition bwtEndPos;
+ static_sequence * alphabetrank;
+
+ // Sample structures for texts longer than samplerate
+ static_bitsequence * sampled;
+ BlockArray *suffixes;
+ BlockArray *suffixDocId;
+
+ // Total number of texts in the collection
+ unsigned numberOfTexts;
+ // Length of the longest text
+ ulong maxTextLength;
+
+ // Array of document id's in the order of end-markers in BWT
+// static_sequence *Doc;
+ ArrayDoc *Doc;
+
+ // Text storage for fast extraction
+ TextStorage * textStorage;
+
+ // Following methods are not part of the public API
+ uchar * BWT(uchar *);
+ void makewavelet(uchar *);
+ void maketables(ulong, char, CSA::DeltaVector &, const std::string &);
+ DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
+ ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
+ ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
+ ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
+ ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
+ ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
+ ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
+ /**
+ * Count end-markers in given interval
+ */
+ inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
+ {
+ if (sp > ep)
+ return 0;
+
+ ulong ranksp = 0;
+ if (sp != 0)
+ ranksp = alphabetrank->rank(0, sp - 1);
+
+ return alphabetrank->rank(0, ep) - ranksp;
+ }
+
+ /**
+ * Count end-markers in given interval and
+ * within docIds [min,max]
+ */
+ inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
+ {
+ if (sp != 0)
+ sp = alphabetrank->rank(0, sp - 1);
+ ep = alphabetrank->rank(0, ep);
+ if (ep == 0)
+ return 0;
+
+ return Doc->count(sp, ep-1, min, max);
+ }
+
+ /**
+ * Enumerate all end-markers in given interval
+ */
+ inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
+ {
+ if (sp != 0)
+ sp = alphabetrank->rank(0, sp - 1);
+ ep = alphabetrank->rank(0, ep);
+ if (ep == 0)
+ return document_result();
+
+ return Doc->accessAll(sp, ep-1);
+ }
+
+ /**
+ * Enumerate end-markers in given interval and
+ * within docIds [min,max]
+ */
+ inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
+ {
+ if (sp != 0)
+ sp = alphabetrank->rank(0, sp - 1);
+ ep = alphabetrank->rank(0, ep);
+ if (ep == 0)
+ return document_result();
+
+ return Doc->access(sp, ep-1, min, max);
+ }
+
+ /**
+ * Enumerate documents in given interval [sp, ep]
+ */
+ inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
+ {
+ // We want unique document indentifiers, using std::set to collect them
+ // FIXME use unordered_set?
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ resultSet.insert(Doc->access(endmarkerRank));
+ }
+ else
+ {
+ DocId di = (*suffixDocId)[sampled->rank1(i)-1];
+ assert((unsigned)di < numberOfTexts);
+ resultSet.insert(di);
+ }
+ }
+ }
+
+ /**
+ * Enumerate documents in given interval [sp, ep]
+ * and within [begin, end]
+ */
+ inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+ {
+ // We want unique document indentifiers, using std::set to collect them
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ DocId docId = Doc->access(endmarkerRank);
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ else
+ {
+ DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+ assert((unsigned)docId < numberOfTexts);
+ if (docId >= begin && docId <= end)
+ resultSet.insert(docId);
+ }
+ }
+ }
+
+ /**
+ * Enumerate document+position pairs (full_result) of
+ * each suffix in given interval.
+ */
+ inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
+ {
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ DocId docId = Doc->access(endmarkerRank);
+ result.push_back(std::make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+
+ result.push_back(std::make_pair(docId, textPos));
+ }
+ }
+ }
+
+ /**
+ * Enumerate document+position pairs (full_result) of
+ * each suffix in given interval and within [begin, end].
+ */
+ inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
+ {
+ uint tmp_rank_c = 0; // Cache rank value of c.
+ for (; sp <= ep; ++sp)
+ {
+ TextPosition i = sp;
+ TextPosition dist = 0;
+ uchar c = alphabetrank->access(i, tmp_rank_c);
+ while (c != '\0' && !sampled->access(i))
+ {
+ i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
+ c = alphabetrank->access(i, tmp_rank_c);
+ ++ dist;
+ }
+ if (c == '\0')
+ {
+ // Rank among the end-markers in BWT
+ unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
+ DocId docId = Doc->access(endmarkerRank);
+ if (docId >= begin && docId <= end)
+ result.push_back(std::make_pair(docId, dist));
+ }
+ else
+ {
+ TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
+ DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
+
+ if (docId >= begin && docId <= end)
+ result.push_back(std::make_pair(docId, textPos));
+ }
+ }
+ }
+
+}; // class FMIndex
+
+} // namespace SXSI
+
+#endif