X-Git-Url: http://git.nguyen.vg/gitweb/?a=blobdiff_plain;f=CSA.h;fp=CSA.h;h=0000000000000000000000000000000000000000;hb=40ddf9aca842bdc081b6350a4ebfe36b066c94c9;hp=eab3637c4052d0226c37170f100017d4593db17d;hpb=af8938dbee21244687184dd0502a84ce1af70c50;p=SXSI%2FTextCollection.git diff --git a/CSA.h b/CSA.h deleted file mode 100644 index eab3637..0000000 --- a/CSA.h +++ /dev/null @@ -1,352 +0,0 @@ -/****************************************************************************** - * Copyright (C) 2006-2008 by Veli Mäkinen and Niko Välimäki * - * * - * * - * This program is free software; you can redistribute it and/or modify * - * it under the terms of the GNU Lesser General Public License as published * - * by the Free Software Foundation; either version 2 of the License, or * - * (at your option) any later version. * - * * - * This program is distributed in the hope that it will be useful, * - * but WITHOUT ANY WARRANTY; without even the implied warranty of * - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * - * GNU Lesser General Public License for more details. * - * * - * You should have received a copy of the GNU Lesser General Public License * - * along with this program; if not, write to the * - * Free Software Foundation, Inc., * - * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. * - *****************************************************************************/ - -#ifndef _CSA_H_ -#define _CSA_H_ -#include "dynFMI.h" -#include "BitRank.h" -#include "TextCollection.h" -#include "BlockArray.h" -#include "RLWaveletTree.h" -#include "StringIterator.h" -#include -#include - -// Include from XMLTree/libcds -#include // Defines W == 32 -#include -#include -#include - -// Re-define word size to ulong: -#undef W -#if __WORDSIZE == 64 -# define W 64 -#else -# define W 32 -#endif -#undef bitset - -namespace SXSI -{ - -// Un-comment to compare BWT against a BWT generated from class dynFMI: -//#define CSA_TEST_BWT - -/** - * Implementation of the TextCollection interface - * - */ -class CSA : public SXSI::TextCollection { -public: - /** - * Constructor with (default) samplerate - */ - explicit CSA(unsigned); - ~CSA(); - /** - * Following functions implement the interface described in TextCollection.h. - * For details/documentation, see TextCollection.h. - */ - void InsertText(uchar const *); - void MakeStatic(); - bool EmptyText(DocId) const; - uchar* GetText(DocId) const; - /** - * Next method is not supported: - * Supporting GetText for some substring [i,j] - * would require more space. - */ -// uchar* GetText(DocId, TextPosition, TextPosition) const; - - bool IsPrefix(uchar const *) const; - bool IsSuffix(uchar const *) const; - bool IsEqual(uchar const *) const; - bool IsContains(uchar const *) const; - bool IsLessThan(uchar const *) const; - - bool IsPrefix(uchar const *, DocId, DocId) const; - bool IsSuffix(uchar const *, DocId, DocId) const; - bool IsEqual(uchar const *, DocId, DocId) const; - bool IsContains(uchar const *, DocId, DocId) const; - bool IsLessThan(uchar const *, DocId, DocId) const; - - ulong Count(uchar const *) const; - unsigned CountPrefix(uchar const *) const; - unsigned CountSuffix(uchar const *) const; - unsigned CountEqual(uchar const *) const; - unsigned CountContains(uchar const *) const; - unsigned CountLessThan(const unsigned char*) const; - - unsigned CountPrefix(uchar const *, DocId, DocId) const; - unsigned CountSuffix(uchar const *, DocId, DocId) const; - unsigned CountEqual(uchar const *, DocId, DocId) const; - unsigned CountContains(uchar const *, DocId, DocId) const; - unsigned CountLessThan(uchar const *, DocId, DocId) const; - - // Definition of document_result is inherited from SXSI::TextCollection. - document_result Prefix(uchar const *) const; - document_result Suffix(uchar const *) const; - document_result Equal(uchar const *) const; - document_result Contains(uchar const *) const; - document_result LessThan(uchar const *) const; - - document_result Prefix(uchar const *, DocId, DocId) const; - document_result Suffix(uchar const *, DocId, DocId) const; - document_result Equal(uchar const *, DocId, DocId) const; - document_result Contains(uchar const *, DocId, DocId) const; - document_result LessThan(uchar const *, DocId, DocId) const; - - // Definition of full_result is inherited from SXSI::TextCollection. - full_result FullContains(uchar const *) const; - full_result FullContains(uchar const *, DocId, DocId) const; - - // Index from/to disk - void Load(FILE *, unsigned); - void Save(FILE *) const; - - - // FIXME Remove - void deleteWT() - { - delete alphabetrank; - alphabetrank = 0; - delete [] codetable; - codetable = 0; - } - void deleteSamples() - { - delete sampled; - sampled =0; - delete suffixes; - suffixes = 0; - delete suffixDocId; - suffixDocId = 0; - } - void deleteEndmarker() - { - delete endmarkerDocId; - endmarkerDocId = 0; - } - -private: - // FIXME Unused code - class TCodeEntry { - public: - unsigned count; - unsigned bits; - unsigned code; - TCodeEntry() {count=0;bits=0;code=0u;}; - }; - - - // FIXME Unused code - class THuffAlphabetRank { - // using fixed 0...255 alphabet - private: - BitRank *bitrank; - THuffAlphabetRank *left; - THuffAlphabetRank *right; - TCodeEntry *codetable; - uchar ch; - bool leaf; - public: - THuffAlphabetRank(uchar *, TextPosition, TCodeEntry *, unsigned); - THuffAlphabetRank(FILE *); - ~THuffAlphabetRank(); - - void Save(FILE *); - bool Test(uchar *, TextPosition); - - inline TextPosition rank(int c, TextPosition i) const { // returns the number of characters c before and including position i - THuffAlphabetRank const * temp=this; - if (codetable[c].count == 0) return 0; - unsigned level = 0; - unsigned code = codetable[c].code; - while (!temp->leaf) { - if ((code & (1u<bitrank->rank(i); - temp = temp->left; - } - else { - i = temp->bitrank->rank(i)-1; - temp = temp->right; - } - ++level; - } - return i+1; - }; - inline bool IsCharAtPos(int c, TextPosition i) const { - THuffAlphabetRank const * temp=this; - if (codetable[c].count == 0) return false; - unsigned level = 0; - unsigned code = codetable[c].code; - while (!temp->leaf) { - if ((code & (1u<bitrank->IsBitSet(i)) return false; - i = i-temp->bitrank->rank(i); - temp = temp->left; - } - else { - if (!temp->bitrank->IsBitSet(i)) return false; - i = temp->bitrank->rank(i)-1; - temp = temp->right; - } - ++level; - } - return true; - } - inline uchar access(TextPosition i) const { - THuffAlphabetRank const * temp=this; - while (!temp->leaf) { - if (temp->bitrank->IsBitSet(i)) { - i = temp->bitrank->rank(i)-1; - temp = temp->right; - } - else { - i = i-temp->bitrank->rank(i); - temp = temp->left; - } - } - return temp->ch; - } - - inline uchar charAtPos(ulong i, TextPosition *rank) const { - THuffAlphabetRank const * temp=this; - while (!temp->leaf) { - if (temp->bitrank->IsBitSet(i)) { - i = temp->bitrank->rank(i)-1; - temp = temp->right; - } else { - i = i-temp->bitrank->rank(i); - temp = temp->left; - } - } - (*rank)=i; - return temp->ch; - } - }; - - // FIXME Unused code - class node { - private: - unsigned weight; - uchar value; - node *child0; - node *child1; - - void maketable( unsigned code, unsigned bits, TCodeEntry *codetable ) const; - static void count_chars(uchar *, TextPosition , TCodeEntry *); - static unsigned SetBit(unsigned , unsigned , unsigned ); - public: - node( unsigned char c = 0, unsigned i = 0 ) { - value = c; - weight = i; - child0 = 0; - child1 = 0; - } - - node( node* c0, node *c1 ) { - value = 0; - weight = c0->weight + c1->weight; - child0 = c0; - child1 = c1; - } - - - bool operator>( const node &a ) const { - return weight > a.weight; - } - - static TCodeEntry *makecodetable(uchar *, TextPosition); - }; - - static const uchar versionFlag; - TextPosition n; - unsigned samplerate; - unsigned C[256]; - TextPosition bwtEndPos; -// THuffAlphabetRank *alphabetrank; - // RLWaveletTree *alphabetrank; - static_sequence * alphabetrank; - -#ifdef CSA_TEST_BWT - DynFMI *dynFMI; -#endif - TCodeEntry *codetable; - - // Sample structures for texts longer than samplerate - BSGAP *sampled; - BlockArray *suffixes; - BlockArray *suffixDocId; - - // Total number of texts in the collection - unsigned numberOfTexts; - // Total number of texts including empty texts - unsigned numberOfAllTexts; - // Length of the longest text - ulong maxTextLength; - - // Array of document id's in the order of end-markers in BWT - // Access by endmarkerDocId[rank_$(L, p) - 1]. - BlockArray *endmarkerDocId; - - // FIXME Replace with a more succinct data structure - // Note: Empty texts are already handled inside XMLTree class. - std::set emptyTextId; - BSGAP *emptyTextRank; - - // FIXME A better solution? - std::string texts; - - // Following are not part of the public API - uchar * BWT(uchar *); - void makewavelet(uchar *); - void maketables(); - DocId DocIdAtTextPos(BlockArray*, TextPosition) const; - ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *) const; - ulong Search(uchar const *, TextPosition, TextPosition *, TextPosition *, DocId, DocId) const; - ulong SearchLessThan(uchar const *, TextPosition, TextPosition *, TextPosition *) const; -// TextPosition Inverse(TextPosition) const; -// TextPosition LF(uchar c, TextPosition &sp, TextPosition &ep) const; -// TextPosition Psi(TextPosition) const; -// uchar * Substring(TextPosition, TextPosition) const; -// TextPosition Lookup(TextPosition) const; - - /** - * Count end-markers in given interval - */ - inline unsigned CountEndmarkers(TextPosition sp, TextPosition ep) const - { - if (sp > ep) - return 0; - - ulong ranksp = 0; - if (sp != 0) - ranksp = alphabetrank->rank(0, sp - 1); - - return alphabetrank->rank(0, ep) - ranksp; - } - unsigned CountEndmarkers(TextPosition, TextPosition, DocId, DocId) const; -}; // class CSA - -} // namespace SXSI - -#endif