X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=TCImplementation.h;fp=TCImplementation.h;h=0000000000000000000000000000000000000000;hb=89dc22aee980ba16f757cd9a7f77478c2da50051;hp=ff5dff3fc744884778fd3508f98db7d1d64a1156;hpb=443151511a86083b21c1c06eb610f86b3aed35be;p=SXSI%2FTextCollection.git diff --git a/TCImplementation.h b/TCImplementation.h deleted file mode 100644 index ff5dff3..0000000 --- a/TCImplementation.h +++ /dev/null @@ -1,386 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki * - * * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU Lesser General Public License as published * - * by the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * - *****************************************************************************/ - -#ifndef _TCImplementation_H_ -#define _TCImplementation_H_ - -#include "incbwt/bits/deltavector.h" - -#include "BitRank.h" -#include "TextCollection.h" -#include "BlockArray.h" - -// Include from XMLTree/libcds -#include // Defines W == 32 -#include -#include -#include -#include - -// Re-define word size to ulong: -#undef W -#if __WORDSIZE == 64 -# define W 64 -#else -# define W 32 -#endif -#undef bitset -#undef bitget - - -#include "TextStorage.h" -#include "ArrayDoc.h" -#include -#include - -namespace SXSI -{ - - -/** - * Implementation of the TextCollection interface - * - */ -class TCImplementation : public SXSI::TextCollection { -public: - TCImplementation(uchar *, ulong, unsigned, unsigned, ulong, ulong, - CSA::DeltaVector &, const std::string &, char); - ~TCImplementation(); - - bool EmptyText(DocId) const; - - /** - * Extracting one text. - * - * Call DeleteText() for each pointer returned by GetText() - * to avoid possible memory leaks. - */ - uchar * GetText(DocId) const; - void DeleteText(uchar *text) const - { textStorage->DeleteText(text); } - - /** - * Returns a pointer to the beginning of texts i, i+1, ..., j. - * Texts are separated by a '\0' byte. - * - * Call DeleteText() for each pointer returned by GetText() - * to avoid possible memory leaks. - */ - uchar * GetText(DocId i, DocId j) const - { return textStorage->GetText(i, j); } - - /** - * Returns a substring of given text ID. - * - * FIXME This may be reimplemented via TextStorage. - */ -// uchar* GetText(DocId, TextPosition, TextPosition) const; - - bool IsPrefix(uchar const *) const; - bool IsSuffix(uchar const *) const; - bool IsEqual(uchar const *) const; - bool IsContains(uchar const *) const; - bool IsLessThan(uchar const *) const; - - bool IsPrefix(uchar const *, DocId, DocId) const; - bool IsSuffix(uchar const *, DocId, DocId) const; - bool IsEqual(uchar const *, DocId, DocId) const; - bool IsContains(uchar const *, DocId, DocId) const; - bool IsLessThan(uchar const *, DocId, DocId) const; - - ulong Count(uchar const *) const; - unsigned CountPrefix(uchar const *) const; - unsigned CountSuffix(uchar const *) const; - unsigned CountEqual(uchar const *) const; - unsigned CountContains(uchar const *) const; - unsigned CountLessThan(const unsigned char*) const; - - unsigned CountPrefix(uchar const *, DocId, DocId) const; - unsigned CountSuffix(uchar const *, DocId, DocId) const; - unsigned CountEqual(uchar const *, DocId, DocId) const; - unsigned CountContains(uchar const *, DocId, DocId) const; - unsigned CountLessThan(uchar const *, DocId, DocId) const; - - // Definition of document_result is inherited from SXSI::TextCollection. - document_result Prefix(uchar const *) const; - document_result Suffix(uchar const *) const; - document_result Equal(uchar const *) const; - document_result Contains(uchar const *) const; - document_result LessThan(uchar const *) const; - document_result KMismaches(uchar const *, unsigned) const; - document_result KErrors(uchar const *, unsigned) const; - - document_result Prefix(uchar const *, DocId, DocId) const; - document_result Suffix(uchar const *, DocId, DocId) const; - document_result Equal(uchar const *, DocId, DocId) const; - document_result Contains(uchar const *, DocId, DocId) const; - document_result LessThan(uchar const *, DocId, DocId) const; - - // Definition of full_result is inherited from SXSI::TextCollection. - full_result FullContains(uchar const *) const; - full_result FullContains(uchar const *, DocId, DocId) const; - full_result FullKMismatches(uchar const *, unsigned) const; - full_result FullKErrors(uchar const *, unsigned) const; - - // Index from/to disk - TCImplementation(FILE *, index_mode_t, unsigned); - void Save(FILE *) const; - -private: - typedef std::vector > suffix_range_vector; - - static const uchar versionFlag; - TextPosition n; - unsigned samplerate; - unsigned C[256]; - TextPosition bwtEndPos; - static_sequence * alphabetrank; - - // Sample structures for texts longer than samplerate - static_bitsequence * sampled; - BlockArray *suffixes; - BlockArray *suffixDocId; - - // Total number of texts in the collection - unsigned numberOfTexts; - // Length of the longest text - ulong maxTextLength; - - // Array of document id's in the order of end-markers in BWT -// static_sequence *Doc; - ArrayDoc *Doc; - - // Text storage for fast extraction - TextStorage * textStorage; - - // Following methods are not part of the public API - uchar * BWT(uchar *); - void makewavelet(uchar *); - void maketables(ulong, char, CSA::DeltaVector &, const std::string &); - DocId DocIdAtTextPos(BlockArray*, TextPosition) const; - ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const; - ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - ulong searchPrefix(uchar const *pattern, ulong i, ulong *sp, ulong *ep) const; - ulong kmismatches(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned) const; - ulong kerrors(suffix_range_vector &, uchar const *, ulong, ulong, ulong, unsigned, ulong const *, ulong) const; - /** - * Count end-markers in given interval - */ - inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const - { - if (sp > ep) - return 0; - - ulong ranksp = 0; - if (sp != 0) - ranksp = alphabetrank->rank(0, sp - 1); - - return alphabetrank->rank(0, ep) - ranksp; - } - - /** - * Count end-markers in given interval and - * within docIds [min,max] - */ - inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const - { - if (sp != 0) - sp = alphabetrank->rank(0, sp - 1); - ep = alphabetrank->rank(0, ep); - if (ep == 0) - return 0; - - return Doc->count(sp, ep-1, min, max); - } - - /** - * Enumerate all end-markers in given interval - */ - inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep) const - { - if (sp != 0) - sp = alphabetrank->rank(0, sp - 1); - ep = alphabetrank->rank(0, ep); - if (ep == 0) - return document_result(); - - return Doc->accessAll(sp, ep-1); - } - - /** - * Enumerate end-markers in given interval and - * within docIds [min,max] - */ - inline document_result EnumerateEndmarkers(TextPosition sp, TextPosition ep, DocId min, DocId max) const - { - if (sp != 0) - sp = alphabetrank->rank(0, sp - 1); - ep = alphabetrank->rank(0, ep); - if (ep == 0) - return document_result(); - - return Doc->access(sp, ep-1, min, max); - } - - /** - * Enumerate documents in given interval [sp, ep] - */ - inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep) const - { - // We want unique document indentifiers, using std::set to collect them - // FIXME use unordered_set? - uint tmp_rank_c = 0; // Cache rank value of c. - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - uchar c = alphabetrank->access(i, tmp_rank_c); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i, tmp_rank_c); - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; - resultSet.insert(Doc->access(endmarkerRank)); - } - else - { - DocId di = (*suffixDocId)[sampled->rank1(i)-1]; - assert((unsigned)di < numberOfTexts); - resultSet.insert(di); - } - } - } - - /** - * Enumerate documents in given interval [sp, ep] - * and within [begin, end] - */ - inline void EnumerateDocuments(std::set &resultSet, TextPosition sp, TextPosition ep, DocId begin, DocId end) const - { - // We want unique document indentifiers, using std::set to collect them - uint tmp_rank_c = 0; // Cache rank value of c. - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - uchar c = alphabetrank->access(i, tmp_rank_c); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i, tmp_rank_c); - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; - DocId docId = Doc->access(endmarkerRank); - if (docId >= begin && docId <= end) - resultSet.insert(docId); - } - else - { - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - assert((unsigned)docId < numberOfTexts); - if (docId >= begin && docId <= end) - resultSet.insert(docId); - } - } - } - - /** - * Enumerate document+position pairs (full_result) of - * each suffix in given interval. - */ - inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep) const - { - uint tmp_rank_c = 0; // Cache rank value of c. - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - TextPosition dist = 0; - uchar c = alphabetrank->access(i, tmp_rank_c); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i, tmp_rank_c); - ++ dist; - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; - DocId docId = Doc->access(endmarkerRank); - result.push_back(std::make_pair(docId, dist)); - } - else - { - TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - - result.push_back(std::make_pair(docId, textPos)); - } - } - } - - /** - * Enumerate document+position pairs (full_result) of - * each suffix in given interval and within [begin, end]. - */ - inline void EnumeratePositions(full_result &result, TextPosition sp, TextPosition ep, DocId begin, DocId end) const - { - uint tmp_rank_c = 0; // Cache rank value of c. - for (; sp <= ep; ++sp) - { - TextPosition i = sp; - TextPosition dist = 0; - uchar c = alphabetrank->access(i, tmp_rank_c); - while (c != '\0' && !sampled->access(i)) - { - i = C[c]+tmp_rank_c-1; //alphabetrank->rank(c,i)-1; - c = alphabetrank->access(i, tmp_rank_c); - ++ dist; - } - if (c == '\0') - { - // Rank among the end-markers in BWT - unsigned endmarkerRank = tmp_rank_c-1; //alphabetrank->rank(0, i) - 1; - DocId docId = Doc->access(endmarkerRank); - if (docId >= begin && docId <= end) - result.push_back(std::make_pair(docId, dist)); - } - else - { - TextPosition textPos = (*suffixes)[sampled->rank1(i)-1] + dist; - DocId docId = (*suffixDocId)[sampled->rank1(i)-1]; - - if (docId >= begin && docId <= end) - result.push_back(std::make_pair(docId, textPos)); - } - } - } - -}; // class TCImplementation - -} // namespace SXSI - -#endif