+++ /dev/null
-/******************************************************************************
- * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki *
- * *
- * *
- * This program is free software; you can redistribute it and/or modify *
- * it under the terms of the GNU Lesser General Public License as published *
- * by the Free Software Foundation; either version 2 of the License, or *
- * (at your option) any later version. *
- * *
- * This program is distributed in the hope that it will be useful, *
- * but WITHOUT ANY WARRANTY; without even the implied warranty of *
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
- * GNU Lesser General Public License for more details. *
- * *
- * You should have received a copy of the GNU Lesser General Public License *
- * along with this program; if not, write to the *
- * Free Software Foundation, Inc., *
- * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. *
- *****************************************************************************/
-
-#ifndef _TCImplementation_H_
-#define _TCImplementation_H_
-
-#include "incbwt/bits/deltavector.h"
-
-#include "BitRank.h"
-#include "TextCollection.h"
-#include "BlockArray.h"
-
-// Include from XMLTree/libcds
-#include <basics.h> // Defines W == 32
-#include <static_bitsequence.h>
-#include <alphabet_mapper.h>
-#include <static_sequence.h>
-#include <static_sequence_wvtree_noptrs.h>
-
-// Re-define word size to ulong:
-#undef W
-#if __WORDSIZE == 64
-# define W 64
-#else
-# define W 32
-#endif
-#undef bitset
-#undef bitget
-
-
-#include "TextStorage.h"
-#include "ArrayDoc.h"
-#include <set>
-#include <string>
-
-namespace SXSI
-{
-
-
-/**
- * Implementation of the TextCollection interface
- *
- */
-class TCImplementation : public SXSI::TextCollection {
-public:
- TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong,
- CSA::DeltaVector &, const std::string &, char);
- ~TCImplementation();
-
- bool EmptyText(DocId) const;
-
- /**
- * Extracting one text.
- *
- * Call DeleteText() for each pointer returned by GetText()
- * to avoid possible memory leaks.
- */
- uchar * GetText(DocId) const;
- void DeleteText(uchar *text) const
- { textStorage->DeleteText(text); }
-
- /**
- * Returns a pointer to the beginning of texts i, i+1, ..., j.
- * Texts are separated by a '\0' byte.
- *
- * Call DeleteText() for each pointer returned by GetText()
- * to avoid possible memory leaks.
- */
- uchar * GetText(DocId i, DocId j) const
- { return textStorage->GetText(i, j); }
-
- /**
- * Returns a substring of given text ID.
- *
- * FIXME This may be reimplemented via TextStorage.
- */
-// uchar* GetText(DocId, TextPosition, TextPosition) const;
-
- bool IsPrefix(uchar const *) const;
- bool IsSuffix(uchar const *) const;
- bool IsEqual(uchar const *) const;
- bool IsContains(uchar const *) const;
- bool IsLessThan(uchar const *) const;
-
- bool IsPrefix(uchar const *, DocId, DocId) const;
- bool IsSuffix(uchar const *, DocId, DocId) const;
- bool IsEqual(uchar const *, DocId, DocId) const;
- bool IsContains(uchar const *, DocId, DocId) const;
- bool IsLessThan(uchar const *, DocId, DocId) const;
-
- ulong Count(uchar const *) const;
- unsigned CountPrefix(uchar const *) const;
- unsigned CountSuffix(uchar const *) const;
- unsigned CountEqual(uchar const *) const;
- unsigned CountContains(uchar const *) const;
- unsigned CountLessThan(const unsigned char*) const;
-
- unsigned CountPrefix(uchar const *, DocId, DocId) const;
- unsigned CountSuffix(uchar const *, DocId, DocId) const;
- unsigned CountEqual(uchar const *, DocId, DocId) const;
- unsigned CountContains(uchar const *, DocId, DocId) const;
- unsigned CountLessThan(uchar const *, DocId, DocId) const;
-
- // Definition of document_result is inherited from SXSI::TextCollection.
- document_result Prefix(uchar const *) const;
- document_result Suffix(uchar const *) const;
- document_result Equal(uchar const *) const;
- document_result Contains(uchar const *) const;
- document_result LessThan(uchar const *) const;
- document_result KMismaches(uchar const *, unsigned) const;
- document_result KErrors(uchar const *, unsigned) const;
-
- document_result Prefix(uchar const *, DocId, DocId) const;
- document_result Suffix(uchar const *, DocId, DocId) const;
- document_result Equal(uchar const *, DocId, DocId) const;
- document_result Contains(uchar const *, DocId, DocId) const;
- document_result LessThan(uchar const *, DocId, DocId) const;
-
- // Definition of full_result is inherited from SXSI::TextCollection.
- full_result FullContains(uchar const *) const;
- full_result FullContains(uchar const *, DocId, DocId) const;
- full_result FullKMismatches(uchar const *, unsigned) const;
- full_result FullKErrors(uchar const *, unsigned) const;
-
- // Index from/to disk
- TCImplementation(FILE *, index_mode_t, unsigned);
- void Save(FILE *) const;
-
-private:
- typedef std::vector<std::pair<ulong, ulong> > suffix_range_vector;
-
- static const uchar versionFlag;
- TextPosition n;
- unsigned samplerate;
- unsigned C[256];
- TextPosition bwtEndPos;
- static_sequence * alphabetrank;
-
- // Sample structures for texts longer than samplerate
- static_bitsequence * sampled;
- BlockArray *suffixes;
- BlockArray *suffixDocId;
-
- // Total number of texts in the collection
- unsigned numberOfTexts;
- // Length of the longest text
- ulong maxTextLength;
-
- // Array of document id's in the order of end-markers in BWT
-// static_sequence *Doc;
- ArrayDoc *Doc;
-
- // Text storage for fast extraction
- TextStorage * textStorage;
-
- // Following methods are not part of the public API
- uchar * BWT(uchar *);
- void makewavelet(uchar *);
- void maketables(ulong, char, CSA::DeltaVector &, const std::string &);
- DocId DocIdAtTextPos(BlockArray*, TextPosition) const;
- ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
- ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const;
- ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const;
- ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const;
- ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const;
- ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const;
- /**
- * Count end-markers in given interval
- */
- inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const
- {
- if (sp > ep)
- return 0;
-
- ulong ranksp = 0;
- if (sp != 0)
- ranksp = alphabetrank->rank(0, sp - 1);
-
- return alphabetrank->rank(0, ep) - ranksp;
- }
-
- /**
- * Count end-markers in given interval and
- * within docIds [min,max]
- */
- inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
- {
- if (sp != 0)
- sp = alphabetrank->rank(0, sp - 1);
- ep = alphabetrank->rank(0, ep);
- if (ep == 0)
- return 0;
-
- return Doc->count(sp, ep-1, min, max);
- }
-
- /**
- * Enumerate all end-markers in given interval
- */
- inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const
- {
- if (sp != 0)
- sp = alphabetrank->rank(0, sp - 1);
- ep = alphabetrank->rank(0, ep);
- if (ep == 0)
- return document_result();
-
- return Doc->accessAll(sp, ep-1);
- }
-
- /**
- * Enumerate end-markers in given interval and
- * within docIds [min,max]
- */
- inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const
- {
- if (sp != 0)
- sp = alphabetrank->rank(0, sp - 1);
- ep = alphabetrank->rank(0, ep);
- if (ep == 0)
- return document_result();
-
- return Doc->access(sp, ep-1, min, max);
- }
-
- /**
- * Enumerate documents in given interval [sp, ep]
- */
- inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep) const
- {
- // We want unique document indentifiers, using std::set to collect them
- // FIXME use unordered_set?
- uint tmp_rank_c = 0; // Cache rank value of c.
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- uchar c = alphabetrank->access(i, tmp_rank_c);
- while (c != '\0' && !sampled->access(i))
- {
- i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i, tmp_rank_c);
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
- resultSet.insert(Doc->access(endmarkerRank));
- }
- else
- {
- DocId di = (*suffixDocId)[sampled->rank1(i)-1];
- assert((unsigned)di < numberOfTexts);
- resultSet.insert(di);
- }
- }
- }
-
- /**
- * Enumerate documents in given interval [sp, ep]
- * and within [begin, end]
- */
- inline void EnumerateDocuments(std::set<DocId> &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
- {
- // We want unique document indentifiers, using std::set to collect them
- uint tmp_rank_c = 0; // Cache rank value of c.
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- uchar c = alphabetrank->access(i, tmp_rank_c);
- while (c != '\0' && !sampled->access(i))
- {
- i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i, tmp_rank_c);
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
- DocId docId = Doc->access(endmarkerRank);
- if (docId >= begin && docId <= end)
- resultSet.insert(docId);
- }
- else
- {
- DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
- assert((unsigned)docId < numberOfTexts);
- if (docId >= begin && docId <= end)
- resultSet.insert(docId);
- }
- }
- }
-
- /**
- * Enumerate document+position pairs (full_result) of
- * each suffix in given interval.
- */
- inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const
- {
- uint tmp_rank_c = 0; // Cache rank value of c.
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- TextPosition dist = 0;
- uchar c = alphabetrank->access(i, tmp_rank_c);
- while (c != '\0' && !sampled->access(i))
- {
- i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i, tmp_rank_c);
- ++ dist;
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
- DocId docId = Doc->access(endmarkerRank);
- result.push_back(std::make_pair(docId, dist));
- }
- else
- {
- TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
- DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
-
- result.push_back(std::make_pair(docId, textPos));
- }
- }
- }
-
- /**
- * Enumerate document+position pairs (full_result) of
- * each suffix in given interval and within [begin, end].
- */
- inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const
- {
- uint tmp_rank_c = 0; // Cache rank value of c.
- for (; sp <= ep; ++sp)
- {
- TextPosition i = sp;
- TextPosition dist = 0;
- uchar c = alphabetrank->access(i, tmp_rank_c);
- while (c != '\0' && !sampled->access(i))
- {
- i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1;
- c = alphabetrank->access(i, tmp_rank_c);
- ++ dist;
- }
- if (c == '\0')
- {
- // Rank among the end-markers in BWT
- unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1;
- DocId docId = Doc->access(endmarkerRank);
- if (docId >= begin && docId <= end)
- result.push_back(std::make_pair(docId, dist));
- }
- else
- {
- TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist;
- DocId docId = (*suffixDocId)[sampled->rank1(i)-1];
-
- if (docId >= begin && docId <= end)
- result.push_back(std::make_pair(docId, textPos));
- }
- }
- }
-
-}; // class TCImplementation
-
-} // namespace SXSI
-
-#endif